diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index d20167bcc7420..06cad69754816 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -294,9 +294,19 @@ Error YAMLProfileReader::preprocessProfile(BinaryContext &BC) {
   buildNameMaps(BC);
 
   // Preliminary assign function execution count.
-  for (auto [YamlBF, BF] : llvm::zip_equal(YamlBP.Functions, ProfileBFs))
-    if (BF)
+  for (auto [YamlBF, BF] : llvm::zip_equal(YamlBP.Functions, ProfileBFs)) {
+    if (!BF)
+      continue;
+    if (!BF->hasProfile()) {
       BF->setExecutionCount(YamlBF.ExecCount);
+    } else {
+      if (opts::Verbosity >= 1) {
+        errs() << "BOLT-WARNING: dropping duplicate profile for " << YamlBF.Name
+               << '\n';
+      }
+      BF = nullptr;
+    }
+  }
 
   return Error::success();
 }
diff --git a/bolt/test/X86/yaml-multiple-profiles.test b/bolt/test/X86/yaml-multiple-profiles.test
new file mode 100644
index 0000000000000..d94f8dec02f14
--- /dev/null
+++ b/bolt/test/X86/yaml-multiple-profiles.test
@@ -0,0 +1,53 @@
+# This test ensures that a YAML profile with multiple profiles matching the same
+# function is handled gracefully.
+
+# REQUIRES: system-linux
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
+# RUN: llvm-bolt %t.exe -o /dev/null --data %t/profile.yaml \
+# RUN:   --profile-ignore-hash -v=1 2>&1 | FileCheck %s
+# CHECK: BOLT-WARNING: dropping duplicate profile for main_alias(*2)
+#--- main.s
+  .globl main_alias
+  .type main_alias, %function
+main_alias:
+  .globl main
+  .type main, %function
+main:
+  .cfi_startproc
+  cmpl $0x0, %eax
+  retq
+  .cfi_endproc
+.size main, .-main
+.size main_alias, .-main_alias
+#--- profile.yaml
+---
+header:
+  profile-version: 1
+  binary-name: 'yaml-multiple-profiles.test.tmp.exe'
+  binary-build-id: ''
+  profile-flags: [ lbr ]
+  profile-origin: branch profile reader
+  profile-events: ''
+  dfs-order: false
+functions:
+  - name: 'main(*2)'
+    fid: 1
+    hash: 0x50BBA3441D436491
+    exec: 1
+    nblocks: 1
+    blocks:
+      - bid: 0
+        insns: 2
+        hash: 0x4D4D8FAF7D4C0000
+  - name: 'main_alias(*2)'
+    fid: 1
+    hash: 0x50BBA3441D436491
+    exec: 1
+    nblocks: 1
+    blocks:
+      - bid: 0
+        insns: 2
+        hash: 0x4D4D8FAF7D4C0000
+...
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7e964f6eb435b..ba91f9481fe98 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -297,6 +297,17 @@ Bug Fixes to C++ Support
   definition the specialization was instantiated from.
   (`#26057 <https://github.com/llvm/llvm-project/issues/26057>`_)
 
+- Fix a crash when a default member initializer of a base aggregate
+  makes an invalid call to an immediate function.
+  (`#66324 <https://github.com/llvm/llvm-project/issues/66324>`_)
+
+- Fix crash for a lambda attribute with a statement expression
+  that contains a ``return``.
+  (`#48527 <https://github.com/llvm/llvm-project/issues/48527>`_)
+
+- Clang now no longer asserts when an UnresolvedLookupExpr is used as an
+  expression requirement. (`#66612 <https://github.com/llvm/llvm-project/issues/66612>`_)
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 - Fixed an import failure of recursive friend class template.
@@ -337,6 +348,23 @@ X86 Support
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
 
+Android Support
+^^^^^^^^^^^^^^^
+
+- Android target triples are usually suffixed with a version. Clang searches for
+  target-specific runtime and standard libraries in directories named after the
+  target (e.g. if you're building with ``--target=aarch64-none-linux-android21``,
+  Clang will look for ``lib/aarch64-none-linux-android21`` under its resource
+  directory to find runtime libraries). If an exact match isn't found, Clang
+  would previously fall back to a directory without any version (which would be
+  ``lib/aarch64-none-linux-android`` in our example). Clang will now look for
+  directories for lower versions and use the newest version it finds instead,
+  e.g. if you have ``lib/aarch64-none-linux-android21`` and
+  ``lib/aarch64-none-linux-android29``, ``-target aarch64-none-linux-android23``
+  will use the former and ``-target aarch64-none-linux-android30`` will use the
+  latter. Falling back to a versionless directory will now emit a warning, and
+  the fallback will be removed in Clang 19.
+
 Windows Support
 ^^^^^^^^^^^^^^^
 - Fixed an assertion failure that occurred due to a failure to propagate
@@ -425,6 +453,13 @@ Static Analyzer
   bitwise shift operators produce undefined behavior (because some operand is
   negative or too large).
 
+- The ``alpha.security.taint.TaintPropagation`` checker no longer propagates
+  taint on ``strlen`` and ``strnlen`` calls, unless these are explicitly
+  marked as propagators in the user-provided taint configuration file.
+  This removal empirically reduces the number of false-positive reports.
+  See the PR for details.
+  (`#66086 <https://github.com/llvm/llvm-project/pull/66086>`_)
+
 .. _release-notes-sanitizers:
 
 Sanitizers
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 54ea49e7426cc..dbd6d77878235 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -2599,8 +2599,8 @@ Default propagations rules:
   ``memcpy``, ``memmem``, ``memmove``, ``mbtowc``, ``pread``, ``qsort``,
   ``qsort_r``, ``rawmemchr``, ``read``, ``recv``, ``recvfrom``, ``rindex``,
   ``strcasestr``, ``strchr``, ``strchrnul``, ``strcasecmp``, ``strcmp``,
-  ``strcspn``, ``strlen``, ``strncasecmp``, ``strncmp``, ``strndup``,
-  ``strndupa``, ``strnlen``, ``strpbrk``, ``strrchr``, ``strsep``, ``strspn``,
+  ``strcspn``, ``strncasecmp``, ``strncmp``, ``strndup``,
+  ``strndupa``, ``strpbrk``, ``strrchr``, ``strsep``, ``strspn``,
   ``strstr``, ``strtol``, ``strtoll``, ``strtoul``, ``strtoull``, ``tolower``,
   ``toupper``, ``ttyname``, ``ttyname_r``, ``wctomb``, ``wcwidth``
diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h
index 9d28325c1ea67..13e37ac2b56b6 100644
--- a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h
+++ b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h
@@ -361,7 +361,7 @@ class SExprBuilder {
     unsigned NumArgs = 0;
 
     // Function arguments
-    const Expr *const *FunArgs = nullptr;
+    llvm::PointerUnion<const Expr *const *, til::SExpr *> FunArgs = nullptr;
 
     // is Self referred to with -> or .?
    bool SelfArrow = false;
diff --git a/clang/include/clang/Analysis/CFG.h b/clang/include/clang/Analysis/CFG.h
index cf4fa2da2a358..67383bb316d31 100644
--- a/clang/include/clang/Analysis/CFG.h
+++ b/clang/include/clang/Analysis/CFG.h
@@ -14,10 +14,11 @@
 #ifndef LLVM_CLANG_ANALYSIS_CFG_H
 #define LLVM_CLANG_ANALYSIS_CFG_H
 
-#include "clang/Analysis/Support/BumpVector.h"
-#include "clang/Analysis/ConstructionContext.h"
+#include "clang/AST/Attr.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/ExprObjC.h"
+#include "clang/Analysis/ConstructionContext.h"
+#include "clang/Analysis/Support/BumpVector.h"
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/GraphTraits.h"
@@ -74,7 +75,8 @@ class CFGElement {
     MemberDtor,
     TemporaryDtor,
     DTOR_BEGIN = AutomaticObjectDtor,
-    DTOR_END = TemporaryDtor
+    DTOR_END = TemporaryDtor,
+    CleanupFunction,
   };
 
 protected:
@@ -383,6 +385,32 @@ class CFGImplicitDtor : public CFGElement {
   }
 };
 
+class CFGCleanupFunction final : public CFGElement {
+public:
+  CFGCleanupFunction() = default;
+  CFGCleanupFunction(const VarDecl *VD)
+      : CFGElement(Kind::CleanupFunction, VD) {
+    assert(VD->hasAttr<CleanupAttr>());
+  }
+
+  const VarDecl *getVarDecl() const {
+    return static_cast<const VarDecl *>(Data1.getPointer());
+  }
+
+  /// Returns the function to be called when cleaning up the var decl.
+  const FunctionDecl *getFunctionDecl() const {
+    const CleanupAttr *A = getVarDecl()->getAttr<CleanupAttr>();
+    return A->getFunctionDecl();
+  }
+
+private:
+  friend class CFGElement;
+
+  static bool isKind(const CFGElement E) {
+    return E.getKind() == Kind::CleanupFunction;
+  }
+};
+
 /// Represents C++ object destructor implicitly generated for automatic object
 /// or temporary bound to const reference at the point of leaving its local
 /// scope.
@@ -1142,6 +1170,10 @@ class CFGBlock {
     Elements.push_back(CFGAutomaticObjDtor(VD, S), C);
   }
 
+  void appendCleanupFunction(const VarDecl *VD, BumpVectorContext &C) {
+    Elements.push_back(CFGCleanupFunction(VD), C);
+  }
+
   void appendLifetimeEnds(VarDecl *VD, Stmt *S, BumpVectorContext &C) {
     Elements.push_back(CFGLifetimeEnds(VD, S), C);
   }
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 9349ff85ca8a1..61ac792d6fda4 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -749,4 +749,10 @@ def warn_drv_missing_multilib : Warning<
   InGroup<DiagGroup<"missing-multilib">>;
 def note_drv_available_multilibs : Note<
   "available multilibs are:%0">;
+
+def warn_android_unversioned_fallback : Warning<
+  "Using unversioned Android target directory %0 for target %1. Unversioned"
+  " directories will not be used in Clang 19. Provide a versioned directory"
+  " for the target version or lower instead.">,
+  InGroup<DiagGroup<"android-unversioned-fallback">>;
}
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 96d76c2384a9a..2d0c1f826c172 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -182,6 +182,9 @@ class ToolChain {
     EffectiveTriple = std::move(ET);
   }
 
+  std::optional<std::string>
+  getFallbackAndroidTargetPath(StringRef BaseDir) const;
+
   mutable std::optional<CXXStdlibType> cxxStdlibType;
   mutable std::optional<RuntimeLibType> runtimeLibType;
   mutable std::optional<UnwindLibType> unwindLibType;
diff --git a/clang/lib/Analysis/CFG.cpp b/clang/lib/Analysis/CFG.cpp
index b82f9010a83f7..03ab4c6fdf29c 100644
--- a/clang/lib/Analysis/CFG.cpp
+++ b/clang/lib/Analysis/CFG.cpp
@@ -881,6 +881,10 @@ class CFGBuilder {
     B->appendAutomaticObjDtor(VD, S, cfg->getBumpVectorContext());
   }
 
+  void appendCleanupFunction(CFGBlock *B, VarDecl *VD) {
+    B->appendCleanupFunction(VD, cfg->getBumpVectorContext());
+  }
+
   void appendLifetimeEnds(CFGBlock *B, VarDecl *VD, Stmt *S) {
     B->appendLifetimeEnds(VD, S, cfg->getBumpVectorContext());
   }
@@ -1346,7 +1350,8 @@ class CFGBuilder {
     return {};
   }
 
-  bool hasTrivialDestructor(VarDecl *VD);
+  bool hasTrivialDestructor(const VarDecl *VD) const;
+  bool needsAutomaticDestruction(const VarDecl *VD) const;
 };
 
 } // namespace
@@ -1861,14 +1866,14 @@ void CFGBuilder::addAutomaticObjDestruction(LocalScope::const_iterator B,
   if (B == E)
     return;
 
-  SmallVector<VarDecl *, 10> DeclsNonTrivial;
-  DeclsNonTrivial.reserve(B.distance(E));
+  SmallVector<VarDecl *, 10> DeclsNeedDestruction;
+  DeclsNeedDestruction.reserve(B.distance(E));
 
   for (VarDecl* D : llvm::make_range(B, E))
-    if (!hasTrivialDestructor(D))
-      DeclsNonTrivial.push_back(D);
+    if (needsAutomaticDestruction(D))
+      DeclsNeedDestruction.push_back(D);
 
-  for (VarDecl *VD : llvm::reverse(DeclsNonTrivial)) {
+  for (VarDecl *VD : llvm::reverse(DeclsNeedDestruction)) {
     if (BuildOpts.AddImplicitDtors) {
       // If this destructor is marked as a no-return destructor, we need to
       // create a new block for the destructor which does not have as a
@@ -1879,7 +1884,8 @@ void CFGBuilder::addAutomaticObjDestruction(LocalScope::const_iterator B,
         Ty = getReferenceInitTemporaryType(VD->getInit());
       Ty = Context->getBaseElementType(Ty);
 
-      if (Ty->getAsCXXRecordDecl()->isAnyDestructorNoReturn())
+      const CXXRecordDecl *CRD = Ty->getAsCXXRecordDecl();
+      if (CRD && CRD->isAnyDestructorNoReturn())
         Block = createNoReturnBlock();
     }
 
@@ -1890,8 +1896,10 @@ void CFGBuilder::addAutomaticObjDestruction(LocalScope::const_iterator B,
     // objects, we end lifetime with scope end.
     if (BuildOpts.AddLifetime)
       appendLifetimeEnds(Block, VD, S);
-    if (BuildOpts.AddImplicitDtors)
+    if (BuildOpts.AddImplicitDtors && !hasTrivialDestructor(VD))
       appendAutomaticObjDtor(Block, VD, S);
+    if (VD->hasAttr<CleanupAttr>())
+      appendCleanupFunction(Block, VD);
   }
 }
 
@@ -1922,7 +1930,7 @@ void CFGBuilder::addScopeExitHandling(LocalScope::const_iterator B,
   // is destroyed, for automatic variables, this happens when the end of the
   // scope is added.
  for (VarDecl* D : llvm::make_range(B, E))
-    if (hasTrivialDestructor(D))
+    if (!needsAutomaticDestruction(D))
       DeclsTrivial.push_back(D);
 
   if (DeclsTrivial.empty())
@@ -2095,7 +2103,11 @@ LocalScope* CFGBuilder::addLocalScopeForDeclStmt(DeclStmt *DS,
   return Scope;
 }
 
-bool CFGBuilder::hasTrivialDestructor(VarDecl *VD) {
+bool CFGBuilder::needsAutomaticDestruction(const VarDecl *VD) const {
+  return !hasTrivialDestructor(VD) || VD->hasAttr<CleanupAttr>();
+}
+
+bool CFGBuilder::hasTrivialDestructor(const VarDecl *VD) const {
   // Check for const references bound to temporary. Set type to pointee.
   QualType QT = VD->getType();
   if (QT->isReferenceType()) {
@@ -2149,7 +2161,7 @@ LocalScope* CFGBuilder::addLocalScopeForVarDecl(VarDecl *VD,
     return Scope;
 
   if (!BuildOpts.AddLifetime && !BuildOpts.AddScopes &&
-      hasTrivialDestructor(VD)) {
+      !needsAutomaticDestruction(VD)) {
     assert(BuildOpts.AddImplicitDtors);
     return Scope;
   }
@@ -5287,6 +5299,7 @@ CFGImplicitDtor::getDestructorDecl(ASTContext &astContext) const {
     case CFGElement::CXXRecordTypedCall:
     case CFGElement::ScopeBegin:
     case CFGElement::ScopeEnd:
+    case CFGElement::CleanupFunction:
       llvm_unreachable("getDestructorDecl should only be used with "
                        "ImplicitDtors");
     case CFGElement::AutomaticObjectDtor: {
@@ -5830,6 +5843,11 @@ static void print_elem(raw_ostream &OS, StmtPrinterHelper &Helper,
     break;
   }
 
+  case CFGElement::Kind::CleanupFunction:
+    OS << "CleanupFunction ("
+       << E.castAs<CFGCleanupFunction>().getFunctionDecl()->getName() << ")\n";
+    break;
+
   case CFGElement::Kind::LifetimeEnds:
     Helper.handleDecl(E.castAs<CFGLifetimeEnds>().getVarDecl(), OS);
     OS << " (Lifetime ends)\n";
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 26e0973490572..9dc528567038a 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -289,11 +289,14 @@ static void insertIfFunction(const Decl &D,
 }
 
 static MemberExpr *getMemberForAccessor(const CXXMemberCallExpr &C) {
+  if (!C.getMethodDecl())
+    return nullptr;
   auto *Body = dyn_cast_or_null<CompoundStmt>(C.getMethodDecl()->getBody());
   if (!Body || Body->size() != 1)
     return nullptr;
   if (auto *RS = dyn_cast<ReturnStmt>(*Body->body_begin()))
-    return dyn_cast<MemberExpr>(RS->getRetValue()->IgnoreParenImpCasts());
+    if (auto *Return = RS->getRetValue())
+      return dyn_cast<MemberExpr>(Return->IgnoreParenImpCasts());
   return nullptr;
 }
 
diff --git a/clang/lib/Analysis/PathDiagnostic.cpp b/clang/lib/Analysis/PathDiagnostic.cpp
index 348afc42319e5..0cb03943c547c 100644
--- a/clang/lib/Analysis/PathDiagnostic.cpp
+++ b/clang/lib/Analysis/PathDiagnostic.cpp
@@ -567,6 +567,7 @@ getLocationForCaller(const StackFrameContext *SFC,
   }
   case CFGElement::ScopeBegin:
   case CFGElement::ScopeEnd:
+  case CFGElement::CleanupFunction:
     llvm_unreachable("not yet implemented!");
   case CFGElement::LifetimeEnds:
   case CFGElement::LoopExit:
diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp
index 34260ac8f4e7d..3e6ceb7d54c42 100644
--- a/clang/lib/Analysis/ThreadSafety.cpp
+++ b/clang/lib/Analysis/ThreadSafety.cpp
@@ -1773,7 +1773,8 @@ void BuildLockset::checkPtAccess(const Expr *Exp, AccessKind AK,
 ///
 /// \param Exp   The call expression.
 /// \param D     The callee declaration.
-/// \param Self  If \p Exp = nullptr, the implicit this argument.
+/// \param Self  If \p Exp = nullptr, the implicit this argument or the argument
+///              of an implicitly called cleanup function.
 /// \param Loc   If \p Exp = nullptr, the location.
void BuildLockset::handleCall(const Expr *Exp, const NamedDecl *D,
                               til::LiteralPtr *Self, SourceLocation Loc) {
@@ -2265,8 +2266,11 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
   const PostOrderCFGView *SortedGraph = walker.getSortedGraph();
   PostOrderCFGView::CFGBlockSet VisitedBlocks(CFGraph);
 
+  CFGBlockInfo &Initial = BlockInfo[CFGraph->getEntry().getBlockID()];
+  CFGBlockInfo &Final = BlockInfo[CFGraph->getExit().getBlockID()];
+
   // Mark entry block as reachable
-  BlockInfo[CFGraph->getEntry().getBlockID()].Reachable = true;
+  Initial.Reachable = true;
 
   // Compute SSA names for local variables
   LocalVarMap.traverseCFG(CFGraph, SortedGraph, BlockInfo);
@@ -2282,8 +2286,8 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
   // to initial lockset. Also turn off checking for lock and unlock functions.
   // FIXME: is there a more intelligent way to check lock/unlock functions?
   if (!SortedGraph->empty() && D->hasAttrs()) {
-    const CFGBlock *FirstBlock = *SortedGraph->begin();
-    FactSet &InitialLockset = BlockInfo[FirstBlock->getBlockID()].EntrySet;
+    assert(*SortedGraph->begin() == &CFGraph->getEntry());
+    FactSet &InitialLockset = Initial.EntrySet;
 
     CapExprSet ExclusiveLocksToAdd;
     CapExprSet SharedLocksToAdd;
@@ -2414,6 +2418,15 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
                             AD.getTriggerStmt()->getEndLoc());
           break;
         }
+
+        case CFGElement::CleanupFunction: {
+          const CFGCleanupFunction &CF = BI.castAs<CFGCleanupFunction>();
+          LocksetBuilder.handleCall(/*Exp=*/nullptr, CF.getFunctionDecl(),
+                                    SxBuilder.createVariable(CF.getVarDecl()),
+                                    CF.getVarDecl()->getLocation());
+          break;
+        }
+
         case CFGElement::TemporaryDtor: {
           auto TD = BI.castAs<CFGTemporaryDtor>();
 
@@ -2455,15 +2468,12 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
     }
   }
 
-  CFGBlockInfo *Initial = &BlockInfo[CFGraph->getEntry().getBlockID()];
-  CFGBlockInfo *Final = &BlockInfo[CFGraph->getExit().getBlockID()];
-
   // Skip the final check if the exit block is unreachable.
-  if (!Final->Reachable)
+  if (!Final.Reachable)
     return;
 
   // By default, we expect all locks held on entry to be held on exit.
-  FactSet ExpectedExitSet = Initial->EntrySet;
+  FactSet ExpectedExitSet = Initial.EntrySet;
 
   // Adjust the expected exit set by adding or removing locks, as declared
   // by *-LOCK_FUNCTION and UNLOCK_FUNCTION.  The intersect below will then
@@ -2479,7 +2489,7 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
       ExpectedExitSet.removeLock(FactMan, Lock);
 
   // FIXME: Should we call this function for all blocks which exit the function?
-  intersectAndWarn(ExpectedExitSet, Final->ExitSet, Final->ExitLoc,
+  intersectAndWarn(ExpectedExitSet, Final.ExitSet, Final.ExitLoc,
                    LEK_LockedAtEndOfFunction, LEK_NotLockedAtEndOfFunction);
 
   Handler.leaveFunction(CurrentFunction);
diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp
index b8286cef396c0..63cc66852a9eb 100644
--- a/clang/lib/Analysis/ThreadSafetyCommon.cpp
+++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp
@@ -110,7 +110,8 @@ static StringRef ClassifyDiagnostic(QualType VDT) {
 /// \param D       The declaration to which the attribute is attached.
 /// \param DeclExp An expression involving the Decl to which the attribute
 ///                is attached. E.g. the call to a function.
-/// \param Self    S-expression to substitute for a \ref CXXThisExpr.
+/// \param Self    S-expression to substitute for a \ref CXXThisExpr in a call,
+///                or argument to a cleanup function.
CapabilityExpr SExprBuilder::translateAttrExpr(const Expr *AttrExp,
                                                const NamedDecl *D,
                                                const Expr *DeclExp,
@@ -144,7 +145,11 @@ CapabilityExpr SExprBuilder::translateAttrExpr(const Expr *AttrExp,
 
   if (Self) {
     assert(!Ctx.SelfArg && "Ambiguous self argument");
-    Ctx.SelfArg = Self;
+    assert(isa<FunctionDecl>(D) && "Self argument requires function");
+    if (isa<CXXMethodDecl>(D))
+      Ctx.SelfArg = Self;
+    else
+      Ctx.FunArgs = Self;
 
     // If the attribute has no arguments, then assume the argument is "this".
     if (!AttrExp)
@@ -312,8 +317,14 @@ til::SExpr *SExprBuilder::translateDeclRefExpr(const DeclRefExpr *DRE,
             ? (cast<FunctionDecl>(D)->getCanonicalDecl() == Canonical)
             : (cast<ObjCMethodDecl>(D)->getCanonicalDecl() == Canonical)) {
       // Substitute call arguments for references to function parameters
-      assert(I < Ctx->NumArgs);
-      return translate(Ctx->FunArgs[I], Ctx->Prev);
+      if (const Expr *const *FunArgs =
+              Ctx->FunArgs.dyn_cast<const Expr *const *>()) {
+        assert(I < Ctx->NumArgs);
+        return translate(FunArgs[I], Ctx->Prev);
+      }
+
+      assert(I == 0);
+      return Ctx->FunArgs.get<til::SExpr *>();
     }
   }
   // Map the param back to the param of the original function declaration
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 52868ca260290..f727a0d5592ef 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2312,8 +2312,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   const unsigned BuiltinIDIfNoAsmLabel =
       FD->hasAttr<AsmLabelAttr>() ? 0 : BuiltinID;
 
-  bool ErrnoOverriden = false;
-  // True if math-errno is overriden via the
+  std::optional<bool> ErrnoOverriden;
+  // ErrnoOverriden is true if math-errno is overridden via the
   // '#pragma float_control(precise, on)'. This pragma disables fast-math,
   // which implies math-errno.
   if (E->hasStoredFPFeatures()) {
@@ -2329,8 +2329,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   // using the '#pragma float_control(precise, off)', and
   // attribute opt-none hasn't been seen.
   bool ErrnoOverridenToFalseWithOpt =
-      !ErrnoOverriden && !OptNone &&
-      CGM.getCodeGenOpts().OptimizationLevel != 0;
+      ErrnoOverriden.has_value() && !ErrnoOverriden.value() && !OptNone &&
+      CGM.getCodeGenOpts().OptimizationLevel != 0;
 
   // There are LLVM math intrinsics/instructions corresponding to math library
   // functions except the LLVM op will never set errno while the math library
@@ -2339,6 +2339,30 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   // LLVM counterparts if the call is marked 'const' (known to never set errno).
   // In case FP exceptions are enabled, the experimental versions of the
   // intrinsics model those.
+  bool ConstAlways =
+      getContext().BuiltinInfo.isConst(BuiltinID);
+
+  // There's a special case with the fma builtins where they are always const
+  // if the target environment is GNU or the target OS is Windows and we're
+  // targeting the MSVCRT.dll environment.
+  // FIXME: This list can become outdated. Need to find a way to get it some
+  // other way.
+  switch (BuiltinID) {
+  case Builtin::BI__builtin_fma:
+  case Builtin::BI__builtin_fmaf:
+  case Builtin::BI__builtin_fmal:
+  case Builtin::BIfma:
+  case Builtin::BIfmaf:
+  case Builtin::BIfmal: {
+    auto &Trip = CGM.getTriple();
+    if (Trip.isGNUEnvironment() || Trip.isOSMSVCRT())
+      ConstAlways = true;
+    break;
+  }
+  default:
+    break;
+  }
+
   bool ConstWithoutErrnoAndExceptions =
       getContext().BuiltinInfo.isConstWithoutErrnoAndExceptions(BuiltinID);
   bool ConstWithoutExceptions =
@@ -2362,14 +2386,17 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   bool ConstWithoutErrnoOrExceptions =
       ConstWithoutErrnoAndExceptions || ConstWithoutExceptions;
   bool GenerateIntrinsics =
-      FD->hasAttr<ConstAttr>() && !ErrnoOverriden && !OptNone;
+      (ConstAlways && !OptNone) ||
+      (!getLangOpts().MathErrno &&
+       !(ErrnoOverriden.has_value() && ErrnoOverriden.value()) && !OptNone);
   if (!GenerateIntrinsics) {
     GenerateIntrinsics =
         ConstWithoutErrnoOrExceptions && !ConstWithoutErrnoAndExceptions;
     if (!GenerateIntrinsics)
       GenerateIntrinsics =
           ConstWithoutErrnoOrExceptions &&
-          (!getLangOpts().MathErrno && !ErrnoOverriden && !OptNone);
+          (!getLangOpts().MathErrno &&
+           !(ErrnoOverriden.has_value() && ErrnoOverriden.value()) && !OptNone);
     if (!GenerateIntrinsics)
       GenerateIntrinsics =
           ConstWithoutErrnoOrExceptions && ErrnoOverridenToFalseWithOpt;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 3a7e90e6b6377..31245964c4baa 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -677,6 +677,50 @@ const char *ToolChain::getCompilerRTArgString(const llvm::opt::ArgList &Args,
   return Args.MakeArgString(getCompilerRT(Args, Component, Type));
 }
 
+// Android target triples contain a target version. If we don't have libraries
+// for the exact target version, we should fall back to the next newest version
+// or a versionless path, if any.
+std::optional<std::string>
+ToolChain::getFallbackAndroidTargetPath(StringRef BaseDir) const {
+  llvm::Triple TripleWithoutLevel(getTriple());
+  TripleWithoutLevel.setEnvironmentName("android"); // remove any version number
+  const std::string &TripleWithoutLevelStr = TripleWithoutLevel.str();
+  unsigned TripleVersion = getTriple().getEnvironmentVersion().getMajor();
+  unsigned BestVersion = 0;
+
+  SmallString<32> TripleDir;
+  bool UsingUnversionedDir = false;
+  std::error_code EC;
+  for (llvm::vfs::directory_iterator LI = getVFS().dir_begin(BaseDir, EC), LE;
+       !EC && LI != LE; LI = LI.increment(EC)) {
+    StringRef DirName = llvm::sys::path::filename(LI->path());
+    StringRef DirNameSuffix = DirName;
+    if (DirNameSuffix.consume_front(TripleWithoutLevelStr)) {
+      if (DirNameSuffix.empty() && TripleDir.empty()) {
+        TripleDir = DirName;
+        UsingUnversionedDir = true;
+      } else {
+        unsigned Version;
+        if (!DirNameSuffix.getAsInteger(10, Version) && Version > BestVersion &&
+            Version < TripleVersion) {
+          BestVersion = Version;
+          TripleDir = DirName;
+          UsingUnversionedDir = false;
+        }
+      }
+    }
+  }
+
+  if (TripleDir.empty())
+    return {};
+
+  SmallString<128> P(BaseDir);
+  llvm::sys::path::append(P, TripleDir);
+  if (UsingUnversionedDir)
+    D.Diag(diag::warn_android_unversioned_fallback) << P << getTripleString();
+  return std::string(P);
+}
+
 std::optional<std::string>
 ToolChain::getTargetSubDirPath(StringRef BaseDir) const {
   auto getPathForTriple =
@@ -713,15 +757,8 @@ ToolChain::getTargetSubDirPath(StringRef BaseDir) const {
     return *Path;
   }
 
-  // Android targets may include an API level at the end. We still want to fall
-  // back on a path without the API level.
-  if (getTriple().isAndroid() &&
-      getTriple().getEnvironmentName() != "android") {
-    llvm::Triple TripleWithoutLevel = getTriple();
-    TripleWithoutLevel.setEnvironmentName("android");
-    if (auto Path = getPathForTriple(TripleWithoutLevel))
-      return *Path;
-  }
+  if (getTriple().isAndroid())
+    return getFallbackAndroidTargetPath(BaseDir);
 
   return {};
 }
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 1f2bb1edd3b7b..83a5674092b26 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -2514,12 +2514,15 @@ void Sema::DiagnoseImmediateEscalatingReason(FunctionDecl *FD) {
       Range = CurrentInit->isWritten() ? CurrentInit->getSourceRange()
                                        : SourceRange();
     }
+
+    FieldDecl *InitializedField = CurrentInit ? CurrentInit->getAnyMember() : nullptr;
+
     SemaRef.Diag(Loc, diag::note_immediate_function_reason)
         << ImmediateFn << Fn << Fn->isConsteval() << IsCall
         << isa<CXXConstructorDecl>(Fn) << ImmediateFnIsConstructor
-        << (CurrentInit != nullptr)
+        << (InitializedField != nullptr)
         << (CurrentInit && !CurrentInit->isWritten())
-        << (CurrentInit ? CurrentInit->getAnyMember() : nullptr) << Range;
+        << InitializedField << Range;
   }
   bool TraverseCallExpr(CallExpr *E) {
     if (const auto *DR =
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index bb4ef065d5c72..77289595972e3 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -9063,7 +9063,8 @@ Sema::BuildExprRequirement(
     concepts::ExprRequirement::ReturnTypeRequirement ReturnTypeRequirement) {
   auto Status = concepts::ExprRequirement::SS_Satisfied;
   ConceptSpecializationExpr *SubstitutedConstraintExpr = nullptr;
-  if (E->isInstantiationDependent() || ReturnTypeRequirement.isDependent())
+  if (E->isInstantiationDependent() || E->getType()->isPlaceholderType() ||
+      ReturnTypeRequirement.isDependent())
     Status = concepts::ExprRequirement::SS_Dependent;
   else if (NoexceptLoc.isValid() && canThrow(E) == CanThrowResult::CT_Can)
     Status = concepts::ExprRequirement::SS_NoexceptNotMet;
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 7cc509542d538..10adfbc406dfb 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -3577,6 +3577,8 @@ StmtResult Sema::ActOnCapScopeReturnStmt(SourceLocation ReturnLoc,
   CapturingScopeInfo *CurCap = cast<CapturingScopeInfo>(getCurFunction());
   QualType FnRetType = CurCap->ReturnType;
   LambdaScopeInfo *CurLambda = dyn_cast<LambdaScopeInfo>(CurCap);
+  if (CurLambda && CurLambda->CallOperator->getType().isNull())
+    return StmtError();
   bool HasDeducedReturnType =
       CurLambda && hasDeducedReturnType(CurLambda->CallOperator);
diff --git a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
index 5da0f34b3d046..dae8ff0c5c8f1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -695,9 +695,10 @@ void GenericTaintChecker::initTaintRules(CheckerContext &C) const {
       {{{"strpbrk"}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
      {{{"strndup"}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
       {{{"strndupa"}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
-      {{{"strlen"}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
-      {{{"wcslen"}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
-      {{{"strnlen"}}, TR::Prop({{0}}, {{ReturnValueIndex}})},
+
+      // strlen, wcslen, strnlen and the like intentionally don't propagate taint.
+      // See the details here: https://github.com/llvm/llvm-project/pull/66086
+
       {{{"strtol"}}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
       {{{"strtoll"}}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
       {{{"strtoul"}}, TR::Prop({{0}}, {{1, ReturnValueIndex}})},
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 0e2ac78f7089c..d7c5bd1d40423 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -993,6 +993,7 @@ void ExprEngine::processCFGElement(const CFGElement E, ExplodedNode *Pred,
       ProcessLoopExit(E.castAs<CFGLoopExit>().getLoopStmt(), Pred);
       return;
     case CFGElement::LifetimeEnds:
+    case CFGElement::CleanupFunction:
    case CFGElement::ScopeBegin:
     case CFGElement::ScopeEnd:
       return;
diff --git a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
index f827f43eaa7da..4fe828bdf7681 100644
--- a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
+++ b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp
@@ -598,9 +598,11 @@ SVal SValBuilder::evalIntegralCast(ProgramStateRef state, SVal val,
   APSIntType ToType(getContext().getTypeSize(castTy),
                     castTy->isUnsignedIntegerType());
   llvm::APSInt ToTypeMax = ToType.getMaxValue();
-
-  NonLoc ToTypeMaxVal = makeIntVal(ToTypeMax);
-
+  NonLoc ToTypeMaxVal =
+      makeIntVal(ToTypeMax.isUnsigned() ? ToTypeMax.getZExtValue()
+                                        : ToTypeMax.getSExtValue(),
+                 castTy)
+          .castAs<NonLoc>();
   // Check the range of the symbol being casted against the maximum value of the
   // target type.
   NonLoc FromVal = val.castAs<NonLoc>();
diff --git a/clang/test/Analysis/bitint-no-crash.c b/clang/test/Analysis/bitint-no-crash.c
deleted file mode 100644
index 6fa041974a3c9..0000000000000
--- a/clang/test/Analysis/bitint-no-crash.c
+++ /dev/null
@@ -1,11 +0,0 @@
- // RUN: %clang_analyze_cc1 -analyzer-checker=core \
- // RUN:   -analyzer-checker=debug.ExprInspection \
- // RUN:   -verify %s
-
-// Don't crash when using _BitInt()
-// expected-no-diagnostics
-_BitInt(256) a;
-_BitInt(129) b;
-void c() {
-  b = a;
-}
diff --git a/clang/test/Analysis/scopes-cfg-output.cpp b/clang/test/Analysis/scopes-cfg-output.cpp
index 6877d124e67a4..4eb8967e37351 100644
--- a/clang/test/Analysis/scopes-cfg-output.cpp
+++ b/clang/test/Analysis/scopes-cfg-output.cpp
@@ -1419,3 +1419,68 @@ void test_multiple_goto_entering_scopes() {
     }
   }
 }
+
+// CHECK: [B1]
+// CHECK-NEXT: 1: CFGScopeBegin(i)
+// CHECK-NEXT: 2: int i __attribute__((cleanup(cleanup_int)));
+// CHECK-NEXT: 3: CleanupFunction (cleanup_int)
+// CHECK-NEXT: 4: CFGScopeEnd(i)
+void cleanup_int(int *i);
+void test_cleanup_functions() {
+  int i __attribute__((cleanup(cleanup_int)));
+}
+
+// CHECK: [B1]
+// CHECK-NEXT: 1: 10
+// CHECK-NEXT: 2: i
+// CHECK-NEXT: 3: [B1.2] = [B1.1]
+// CHECK-NEXT: 4: return;
+// CHECK-NEXT: 5: CleanupFunction (cleanup_int)
+// CHECK-NEXT: 6: CFGScopeEnd(i)
+// CHECK-NEXT: Preds (1): B3
+// CHECK-NEXT: Succs (1): B0
+// CHECK: [B2]
+// CHECK-NEXT: 1: return;
+// CHECK-NEXT: 2: CleanupFunction (cleanup_int)
+// CHECK-NEXT: 3: CFGScopeEnd(i)
+// CHECK-NEXT: Preds (1): B3
+// CHECK-NEXT: Succs (1): B0
+// CHECK: [B3]
+// CHECK-NEXT: 1: CFGScopeBegin(i)
+// CHECK-NEXT: 2: int i __attribute__((cleanup(cleanup_int)));
+// CHECK-NEXT: 3: m
+// CHECK-NEXT: 4: [B3.3] (ImplicitCastExpr, LValueToRValue, int)
+// CHECK-NEXT: 5: 1
+// CHECK-NEXT: 6: [B3.4] == [B3.5]
+// CHECK-NEXT: T: if [B3.6]
+// CHECK-NEXT: Preds (1): B4
+// CHECK-NEXT: Succs (2): B2 B1
+void test_cleanup_functions2(int m) {
+  int i 
__attribute__((cleanup(cleanup_int))); + + if (m == 1) { + return; + } + + i = 10; + return; +} + +// CHECK: [B1] +// CHECK-NEXT: 1: CFGScopeBegin(f) +// CHECK-NEXT: 2: (CXXConstructExpr, [B1.3], F) +// CHECK-NEXT: 3: __attribute__((cleanup(cleanup_F))) F f; +// CHECK-NEXT: 4: CleanupFunction (cleanup_F) +// CHECK-NEXT: 5: [B1.3].~F() (Implicit destructor) +// CHECK-NEXT: 6: CFGScopeEnd(f) +// CHECK-NEXT: Preds (1): B2 +// CHECK-NEXT: Succs (1): B0 +class F { +public: + ~F(); +}; +void cleanup_F(F *f); + +void test() { + F f __attribute((cleanup(cleanup_F))); +} diff --git a/clang/test/Analysis/taint-diagnostic-visitor.c b/clang/test/Analysis/taint-diagnostic-visitor.c index f1b9ceebdd9a6..8a7510177f3e4 100644 --- a/clang/test/Analysis/taint-diagnostic-visitor.c +++ b/clang/test/Analysis/taint-diagnostic-visitor.c @@ -10,6 +10,7 @@ int scanf(const char *restrict format, ...); int system(const char *command); char* getenv( const char* env_var ); size_t strlen( const char* str ); +int atoi( const char* str ); void *malloc(size_t size ); void free( void *ptr ); char *fgets(char *str, int n, FILE *stream); @@ -54,11 +55,11 @@ void taintDiagnosticVLA(void) { // propagating through variables and expressions char *taintDiagnosticPropagation(){ char *pathbuf; - char *pathlist=getenv("PATH"); // expected-note {{Taint originated here}} + char *size=getenv("SIZE"); // expected-note {{Taint originated here}} // expected-note@-1 {{Taint propagated to the return value}} - if (pathlist){ // expected-note {{Assuming 'pathlist' is non-null}} + if (size){ // expected-note {{Assuming 'size' is non-null}} // expected-note@-1 {{Taking true branch}} - pathbuf=(char*) malloc(strlen(pathlist)+1); // expected-warning{{Untrusted data is used to specify the buffer size}} + pathbuf=(char*) malloc(atoi(size)); // expected-warning{{Untrusted data is used to specify the buffer size}} // expected-note@-1{{Untrusted data is used to specify the buffer size}} // expected-note@-2 {{Taint propagated to the return value}} return pathbuf; @@ -71,12 +72,12 @@ char *taintDiagnosticPropagation(){ char *taintDiagnosticPropagation2(){ char *pathbuf; char *user_env2=getenv("USER_ENV_VAR2");//unrelated taint source - char *pathlist=getenv("PATH"); // expected-note {{Taint originated here}} + char *size=getenv("SIZE"); // expected-note {{Taint originated here}} // expected-note@-1 {{Taint propagated to the return value}} char *user_env=getenv("USER_ENV_VAR");//unrelated taint source - if (pathlist){ // expected-note {{Assuming 'pathlist' is non-null}} + if (size){ // expected-note {{Assuming 'size' is non-null}} // expected-note@-1 {{Taking true branch}} - pathbuf=(char*) malloc(strlen(pathlist)+1); // expected-warning{{Untrusted data is used to specify the buffer size}} + pathbuf=(char*) malloc(atoi(size)+1); // expected-warning{{Untrusted data is used to specify the buffer size}} // expected-note@-1{{Untrusted data is used to specify the buffer size}} // expected-note@-2 {{Taint propagated to the return value}} return pathbuf; diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c index c674a03cfaec6..10a52cb442595 100644 --- a/clang/test/Analysis/taint-generic.c +++ b/clang/test/Analysis/taint-generic.c @@ -443,14 +443,18 @@ int testSprintf_propagates_taint(char *buf, char *msg) { return 1 / x; // expected-warning {{Division by a tainted value, possibly zero}} } -void test_wchar_apis_propagate(const char *path) { +void test_wchar_apis_dont_propagate(const char *path) { + // strlen, wcslen, strnlen and 
the like intentionally don't propagate taint.
+  // See the details here: https://github.com/llvm/llvm-project/pull/66086
+  // This isn't ideal, but it is what we have for now.
+
   FILE *f = fopen(path, "r");
   clang_analyzer_isTainted_charp((char*)f);  // expected-warning {{YES}}
   wchar_t wbuf[10];
   fgetws(wbuf, sizeof(wbuf)/sizeof(*wbuf), f);
   clang_analyzer_isTainted_wchar(*wbuf); // expected-warning {{YES}}
   int n = wcslen(wbuf);
-  clang_analyzer_isTainted_int(n); // expected-warning {{YES}}
+  clang_analyzer_isTainted_int(n); // expected-warning {{NO}}
 
   wchar_t dst[100] = L"ABC";
   clang_analyzer_isTainted_wchar(*dst); // expected-warning {{NO}}
@@ -458,7 +462,7 @@
   clang_analyzer_isTainted_wchar(*dst); // expected-warning {{YES}}
 
   int m = wcslen(dst);
-  clang_analyzer_isTainted_int(m); // expected-warning {{YES}}
+  clang_analyzer_isTainted_int(m); // expected-warning {{NO}}
 }
 
 int scanf_s(const char *format, ...);
@@ -948,21 +952,27 @@ void testStrndupa(size_t n) {
 }
 
 size_t strlen(const char *s);
-void testStrlen() {
+void testStrlen_dont_propagate() {
+  // strlen, wcslen, strnlen and the like intentionally don't propagate taint.
+  // See the details here: https://github.com/llvm/llvm-project/pull/66086
+  // This isn't ideal, but it is what we have for now.
   char s[10];
   scanf("%9s", s);
 
   size_t result = strlen(s);
-  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+  // strlen propagating taint would bring in many false positives
+  clang_analyzer_isTainted_int(result); // expected-warning {{NO}}
 }
 
 size_t strnlen(const char *s, size_t maxlen);
-void testStrnlen(size_t maxlen) {
+void testStrnlen_dont_propagate(size_t maxlen) {
+  // strlen, wcslen, strnlen and the like intentionally don't propagate taint.
+  // See the details here: https://github.com/llvm/llvm-project/pull/66086
+  // This isn't ideal, but it is what we have for now.
   char s[10];
   scanf("%9s", s);
 
   size_t result = strnlen(s, maxlen);
-  clang_analyzer_isTainted_int(result); // expected-warning {{YES}}
+  clang_analyzer_isTainted_int(result); // expected-warning {{NO}}
 }
 
 long strtol(const char *restrict nptr, char **restrict endptr, int base);
diff --git a/clang/test/CodeGen/math-builtins.c b/clang/test/CodeGen/math-builtins.c
index 962e311698f57..554c604219957 100644
--- a/clang/test/CodeGen/math-builtins.c
+++ b/clang/test/CodeGen/math-builtins.c
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm %s | FileCheck %s -check-prefix=NO__ERRNO
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -disable-llvm-passes -O2 %s | FileCheck %s -check-prefix=NO__ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -w -S -o - -emit-llvm -disable-llvm-passes -O2 -fmath-errno %s | FileCheck %s -check-prefix=HAS_ERRNO
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown-gnu -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_GNU
 // RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_WIN
 
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index fa8f49d8a2c9f..02df4fe5fea60 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm %s | FileCheck %s --check-prefix=NO__ERRNO
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm -disable-llvm-passes -O2 %s | FileCheck %s --check-prefix=NO__ERRNO
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm -disable-llvm-passes -O2 -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-implicit-function-declaration -w -S -o - -emit-llvm -ffp-exception-behavior=maytrap %s | FileCheck %s --check-prefix=HAS_MAYTRAP
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown-gnu -Wno-implicit-function-declaration -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_GNU
 // RUN: %clang_cc1 -triple x86_64-unknown-windows-msvc -Wno-implicit-function-declaration -w -S -o - -emit-llvm -fmath-errno %s | FileCheck %s --check-prefix=HAS_ERRNO_WIN
diff --git a/clang/test/Driver/Inputs/basic_android_libcxx_tree/usr/lib/aarch64-unknown-linux-android21/libc++.so b/clang/test/Driver/Inputs/basic_android_libcxx_tree/usr/lib/aarch64-unknown-linux-android23/libc++.so
similarity index 100%
rename from clang/test/Driver/Inputs/basic_android_libcxx_tree/usr/lib/aarch64-unknown-linux-android21/libc++.so
rename to clang/test/Driver/Inputs/basic_android_libcxx_tree/usr/lib/aarch64-unknown-linux-android23/libc++.so
diff --git a/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/aarch64-unknown-linux-android21/libclang_rt.builtins.a b/clang/test/Driver/Inputs/basic_android_libcxx_tree/usr/lib/aarch64-unknown-linux-android29/libc++.so
similarity index 100%
rename from 
clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/aarch64-unknown-linux-android21/libclang_rt.builtins.a rename to clang/test/Driver/Inputs/basic_android_libcxx_tree/usr/lib/aarch64-unknown-linux-android29/libc++.so diff --git a/llvm-libgcc/lib/blank.c b/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/aarch64-unknown-linux-android23/libclang_rt.builtins.a similarity index 100% rename from llvm-libgcc/lib/blank.c rename to clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/aarch64-unknown-linux-android23/libclang_rt.builtins.a diff --git a/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/aarch64-unknown-linux-android29/libclang_rt.builtins.a b/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/aarch64-unknown-linux-android29/libclang_rt.builtins.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/android-installed-libcxx.cpp b/clang/test/Driver/android-installed-libcxx.cpp index 4d139a1fb3693..14856e26e2730 100644 --- a/clang/test/Driver/android-installed-libcxx.cpp +++ b/clang/test/Driver/android-installed-libcxx.cpp @@ -13,6 +13,7 @@ // RUN: mkdir -p %t2/include/c++/v1 // RUN: mkdir -p %t2/sysroot // RUN: mkdir -p %t2/include/aarch64-none-linux-android/c++/v1 +// RUN: mkdir -p %t2/include/aarch64-none-linux-android23/c++/v1 // RUN: %clang -target aarch64-none-linux-android -ccc-install-dir %/t2/bin \ // RUN: --sysroot=%t2/sysroot -stdlib=libc++ -fsyntax-only \ @@ -24,3 +25,14 @@ // ANDROID-DIR: "-internal-isystem" "[[DIR]][[SEP:/|\\\\]]..[[SEP]]include[[SEP]]aarch64-none-linux-android[[SEP]]c++[[SEP]]v1" // ANDROID-DIR-SAME: "-internal-isystem" "[[DIR]][[SEP]]..[[SEP]]include[[SEP]]c++[[SEP]]v1" + +// RUN: %clang -target aarch64-none-linux-android23 -ccc-install-dir %/t2/bin \ +// RUN: --sysroot=%t2/sysroot -stdlib=libc++ -fsyntax-only \ +// RUN: %s -### 2>&1 | FileCheck --check-prefix=ANDROID23-DIR -DDIR=%/t2/bin %s + +// RUN: %clang -target aarch64-none-linux-android28 -ccc-install-dir %/t2/bin \ +// RUN: --sysroot=%t2/sysroot -stdlib=libc++ -fsyntax-only \ +// RUN: %s -### 2>&1 | FileCheck --check-prefix=ANDROID23-DIR -DDIR=%/t2/bin %s + +// ANDROID23-DIR: "-internal-isystem" "[[DIR]][[SEP:/|\\\\]]..[[SEP]]include[[SEP]]aarch64-none-linux-android23[[SEP]]c++[[SEP]]v1" +// ANDROID23-DIR-SAME: "-internal-isystem" "[[DIR]][[SEP]]..[[SEP]]include[[SEP]]c++[[SEP]]v1" diff --git a/clang/test/Driver/android-unversioned-fallback-warning.cpp b/clang/test/Driver/android-unversioned-fallback-warning.cpp new file mode 100644 index 0000000000000..62a951d14effa --- /dev/null +++ b/clang/test/Driver/android-unversioned-fallback-warning.cpp @@ -0,0 +1,33 @@ +// Check that we emit warnings for using unversioned Android target directories +// as appropriate. + +// RUN: mkdir -p %t/bin +// RUN: mkdir -p %t/include/aarch64-none-linux-android/c++/v1 +// RUN: mkdir -p %t/include/aarch64-none-linux-android23/c++/v1 +// RUN: mkdir -p %t/include/c++/v1 +// RUN: mkdir -p %t/lib/aarch64-none-linux-android +// RUN: mkdir -p %t/lib/aarch64-none-linux-android23 +// RUN: mkdir -p %t/resource/lib/aarch64-none-linux-android +// RUN: mkdir -p %t/resource/lib/aarch64-none-linux-android23 + +// Using an unversioned directory for an unversioned triple isn't a warning. 
+// RUN: %clang --target=aarch64-none-linux-android -ccc-install-dir %t/bin \ +// RUN: -resource-dir %t/resource -### -c %s 2>&1 | \ +// RUN: FileCheck --check-prefix=NO-WARNING %s +// NO-WARNING-NOT: Using unversioned Android target directory + +// RUN: %clang --target=aarch64-none-linux-android21 -ccc-install-dir %t/bin \ +// RUN: -resource-dir %t/resource -### -c %s 2>&1 | \ +// RUN: FileCheck --check-prefix=ANDROID21 -DDIR=%t -DSEP=%{fs-sep} %s +// ANDROID21-DAG: Using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]include[[SEP]]aarch64-none-linux-android +// ANDROID21-DAG: Using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]lib[[SEP]]aarch64-none-linux-android +// ANDROID21-DAG: Using unversioned Android target directory [[DIR]]/resource[[SEP]]lib[[SEP]]aarch64-none-linux-android + +// 23 or newer should use the versioned directory +// RUN: %clang --target=aarch64-none-linux-android23 -ccc-install-dir %t/bin \ +// RUN: -resource-dir %t/resource -### -c %s 2>&1 | \ +// RUN: FileCheck --check-prefix=NO-WARNING %s + +// RUN: %clang --target=aarch64-none-linux-android28 -ccc-install-dir %t/bin \ +// RUN: -resource-dir %t/resource -### -c %s 2>&1 | \ +// RUN: FileCheck --check-prefix=NO-WARNING %s diff --git a/clang/test/Driver/linux-per-target-runtime-dir.c b/clang/test/Driver/linux-per-target-runtime-dir.c index c5c871236044b..8c721d82821f6 100644 --- a/clang/test/Driver/linux-per-target-runtime-dir.c +++ b/clang/test/Driver/linux-per-target-runtime-dir.c @@ -17,12 +17,25 @@ // RUN: %clang --target=aarch64-unknown-linux-android21 -print-file-name=libc++.so 2>&1 \ // RUN: -ccc-install-dir %S/Inputs/basic_android_libcxx_tree/usr/bin \ // RUN: | FileCheck --check-prefix=CHECK-LIBCXX-ANDROID21 %s -// CHECK-LIBCXX-ANDROID21: ..{{/|\\}}lib{{/|\\}}aarch64-unknown-linux-android21{{/|\\}}libc++.so +// CHECK-LIBCXX-ANDROID21: ..{{/|\\}}lib{{/|\\}}aarch64-unknown-linux-android{{/|\\}}libc++.so // RUN: %clang --target=aarch64-unknown-linux-android23 -print-file-name=libc++.so 2>&1 \ // RUN: -ccc-install-dir %S/Inputs/basic_android_libcxx_tree/usr/bin \ // RUN: | FileCheck --check-prefix=CHECK-LIBCXX-ANDROID23 %s -// CHECK-LIBCXX-ANDROID23: ..{{/|\\}}lib{{/|\\}}aarch64-unknown-linux-android{{/|\\}}libc++.so +// CHECK-LIBCXX-ANDROID23: ..{{/|\\}}lib{{/|\\}}aarch64-unknown-linux-android23{{/|\\}}libc++.so + +// RUN: %clang --target=aarch64-unknown-linux-android26 -print-file-name=libc++.so 2>&1 \ +// RUN: -ccc-install-dir %S/Inputs/basic_android_libcxx_tree/usr/bin \ +// RUN: | FileCheck --check-prefix=CHECK-LIBCXX-ANDROID23 %s + +// RUN: %clang --target=aarch64-unknown-linux-android29 -print-file-name=libc++.so 2>&1 \ +// RUN: -ccc-install-dir %S/Inputs/basic_android_libcxx_tree/usr/bin \ +// RUN: | FileCheck --check-prefix=CHECK-LIBCXX-ANDROID29 %s +// CHECK-LIBCXX-ANDROID29: ..{{/|\\}}lib{{/|\\}}aarch64-unknown-linux-android29{{/|\\}}libc++.so + +// RUN: %clang --target=aarch64-unknown-linux-android31 -print-file-name=libc++.so 2>&1 \ +// RUN: -ccc-install-dir %S/Inputs/basic_android_libcxx_tree/usr/bin \ +// RUN: | FileCheck --check-prefix=CHECK-LIBCXX-ANDROID29 %s // RUN: %clang --target=aarch64-unknown-linux-android -print-file-name=libc++.so 2>&1 \ // RUN: -ccc-install-dir %S/Inputs/basic_android_libcxx_tree/usr/bin \ @@ -45,13 +58,29 @@ // RUN: --target=aarch64-unknown-linux-android21 \ // RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ // RUN: | FileCheck --check-prefix=CHECK-FILE-NAME-ANDROID21 %s -// CHECK-FILE-NAME-ANDROID21: 
lib{{/|\\}}aarch64-unknown-linux-android21{{/|\\}}libclang_rt.builtins.a +// CHECK-FILE-NAME-ANDROID21: lib{{/|\\}}aarch64-unknown-linux-android{{/|\\}}libclang_rt.builtins.a // RUN: %clang -rtlib=compiler-rt -print-file-name=libclang_rt.builtins.a 2>&1 \ // RUN: --target=aarch64-unknown-linux-android23 \ // RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ // RUN: | FileCheck --check-prefix=CHECK-FILE-NAME-ANDROID23 %s -// CHECK-FILE-NAME-ANDROID23: lib{{/|\\}}aarch64-unknown-linux-android{{/|\\}}libclang_rt.builtins.a +// CHECK-FILE-NAME-ANDROID23: lib{{/|\\}}aarch64-unknown-linux-android23{{/|\\}}libclang_rt.builtins.a + +// RUN: %clang -rtlib=compiler-rt -print-file-name=libclang_rt.builtins.a 2>&1 \ +// RUN: --target=aarch64-unknown-linux-android26 \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: | FileCheck --check-prefix=CHECK-FILE-NAME-ANDROID23 %s + +// RUN: %clang -rtlib=compiler-rt -print-file-name=libclang_rt.builtins.a 2>&1 \ +// RUN: --target=aarch64-unknown-linux-android29 \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: | FileCheck --check-prefix=CHECK-FILE-NAME-ANDROID29 %s +// CHECK-FILE-NAME-ANDROID29: lib{{/|\\}}aarch64-unknown-linux-android29{{/|\\}}libclang_rt.builtins.a + +// RUN: %clang -rtlib=compiler-rt -print-file-name=libclang_rt.builtins.a 2>&1 \ +// RUN: --target=aarch64-unknown-linux-android31 \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: | FileCheck --check-prefix=CHECK-FILE-NAME-ANDROID29 %s // RUN: %clang -rtlib=compiler-rt -print-file-name=libclang_rt.builtins.a 2>&1 \ // RUN: --target=aarch64-unknown-linux-android \ diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c index bd8488d81c0b1..3eaceedce685f 100644 --- a/clang/test/Driver/riscv-cpus.c +++ b/clang/test/Driver/riscv-cpus.c @@ -37,6 +37,31 @@ // RUN: %clang --target=riscv32 -### -c %s 2>&1 -mtune=syntacore-scr1-max | FileCheck -check-prefix=MTUNE-SYNTACORE-SCR1-MAX %s // MTUNE-SYNTACORE-SCR1-MAX: "-tune-cpu" "syntacore-scr1-max" +// RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=veyron-v1 | FileCheck -check-prefix=MCPU-VEYRON-V1 %s +// MCPU-VEYRON-V1: "-target-cpu" "veyron-v1" +// MCPU-VEYRON-V1: "-target-feature" "+m" +// MCPU-VEYRON-V1: "-target-feature" "+a" +// MCPU-VEYRON-V1: "-target-feature" "+f" +// MCPU-VEYRON-V1: "-target-feature" "+d" +// MCPU-VEYRON-V1: "-target-feature" "+c" +// MCPU-VEYRON-V1: "-target-feature" "+zicbom" +// MCPU-VEYRON-V1: "-target-feature" "+zicbop" +// MCPU-VEYRON-V1: "-target-feature" "+zicboz" +// MCPU-VEYRON-V1: "-target-feature" "+zicntr" +// MCPU-VEYRON-V1: "-target-feature" "+zicsr" +// MCPU-VEYRON-V1: "-target-feature" "+zifencei" +// MCPU-VEYRON-V1: "-target-feature" "+zihintpause" +// MCPU-VEYRON-V1: "-target-feature" "+zihpm" +// MCPU-VEYRON-V1: "-target-feature" "+zba" +// MCPU-VEYRON-V1: "-target-feature" "+zbb" +// MCPU-VEYRON-V1: "-target-feature" "+zbc" +// MCPU-VEYRON-V1: "-target-feature" "+zbs" +// MCPU-VEYRON-V1: "-target-feature" "+xventanacondops" +// MCPU-VEYRON-V1: "-target-abi" "lp64d" + +// RUN: %clang --target=riscv64 -### -c %s 2>&1 -mtune=veyron-v1 | FileCheck -check-prefix=MTUNE-VEYRON-V1 %s +// MTUNE-VEYRON-V1: "-tune-cpu" "veyron-v1" + // Check mtune alias CPU has resolved to the right CPU according XLEN. 
// RUN: %clang --target=riscv32 -### -c %s 2>&1 -mtune=generic | FileCheck -check-prefix=MTUNE-GENERIC-32 %s
 // MTUNE-GENERIC-32: "-tune-cpu" "generic"
 
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index c44bd6087af41..a639b0ddde85e 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -85,7 +85,7 @@
 
 // RUN: not %clang_cc1 -triple riscv64 -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix RISCV64
 // RISCV64: error: unknown target CPU 'not-a-cpu'
-// RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280{{$}}
+// RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, veyron-v1{{$}}
 
 // RUN: not %clang_cc1 -triple riscv32 -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE-RISCV32
 // TUNE-RISCV32: error: unknown target CPU 'not-a-cpu'
@@ -93,4 +93,4 @@
 
 // RUN: not %clang_cc1 -triple riscv64 -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE-RISCV64
 // TUNE-RISCV64: error: unknown target CPU 'not-a-cpu'
-// TUNE-RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, generic, rocket, sifive-7-series{{$}}
+// TUNE-RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, veyron-v1, generic, rocket, sifive-7-series{{$}}
diff --git a/clang/test/Parser/gh48527.cpp b/clang/test/Parser/gh48527.cpp
new file mode 100644
index 0000000000000..420c35be37f51
--- /dev/null
+++ b/clang/test/Parser/gh48527.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+int main() { // expected-note {{to match this '{'}}
+  auto a = [](void)__attribute__((b(({ // expected-note {{to match this '('}}
+    return 0;
+} // expected-error 3 {{expected ')'}} \
+  // expected-error {{expected ';' at end of declaration}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected body of lambda expression}}
+// expected-error {{expected '}'}}
diff --git a/clang/test/Sema/warn-thread-safety-analysis.c b/clang/test/Sema/warn-thread-safety-analysis.c
index 355616b73d967..642ea88ec3c96 100644
--- a/clang/test/Sema/warn-thread-safety-analysis.c
+++ b/clang/test/Sema/warn-thread-safety-analysis.c
@@ -72,6 +72,8 @@ int get_value(int *p) SHARED_LOCKS_REQUIRED(foo_.mu_){
   return *p;
 }
 
+void unlock_scope(struct Mutex *const *mu) __attribute__((release_capability(**mu)));
+
 int main(void) {
 
   Foo_fun1(1); // expected-warning{{calling function 'Foo_fun1' requires holding mutex 'mu2'}} \
@@ -127,6 +129,13 @@ int main(void) {
                              // expected-note@-1{{mutex released here}}
   mutex_shared_unlock(&mu1); // expected-warning {{releasing mutex 'mu1' that was not held}}
 
+  /// Cleanup functions
+  {
+    struct Mutex* const __attribute__((cleanup(unlock_scope))) scope = &mu1;
+    mutex_exclusive_lock(scope); // Note that we have to lock through scope, because there is no alias analysis!
+    // Cleanup happens automatically -> no warning.
+  }
+
   return 0;
 }
 
diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
index def3bbb8adf0f..c0adbbdf9be63 100644
--- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
+++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
@@ -330,3 +330,26 @@ struct S {
 S s(0); // expected-note {{in the default initializer of 'j'}}
 
 }
+
+namespace GH66324 {
+
+consteval int allocate(); // expected-note 2{{declared here}}
+
+struct _Vector_base {
+  int b = allocate(); // expected-note 2{{undefined function 'allocate' cannot be used in a constant expression}} \
+                      // expected-error {{call to consteval function 'GH66324::allocate' is not a constant expression}} \
+                      // expected-note {{declared here}}
+};
+
+template <typename T>
+struct vector : _Vector_base {
+  constexpr vector()
+  // expected-note@-1 {{'vector' is an immediate constructor because its body contains a call to a consteval function 'allocate' and that call is not a constant expression}}
+  : _Vector_base{} {} // expected-note {{in the default initializer of 'b'}}
+};
+
+vector<int> v{};
+// expected-error@-1 {{call to immediate function 'GH66324::vector<int>::vector' is not a constant expression}}
+// expected-note@-2 {{in call to 'vector()'}}
+
+}
diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp
index 23745dc141547..0c9e8584e6534 100644
--- a/clang/test/SemaCXX/lambda-expressions.cpp
+++ b/clang/test/SemaCXX/lambda-expressions.cpp
@@ -714,3 +714,7 @@ void foo() {
   // CHECK-NEXT: ConstantExpr
   // CHECK-NEXT: value: Int 2
 }
+
+void GH48527() {
+  auto a = []()__attribute__((b(({ return 0; })))){}; // expected-warning {{unknown attribute 'b' ignored}}
+}
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index 891b45aa57892..68050e0f09e24 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1031,3 +1031,20 @@ void test() {
   fff(42UL); // expected-error {{no matching function}}
 }
 }
+
+namespace GH66612 {
+  template <typename C>
+  auto end(C c) ->int;
+
+  template <typename T>
+  concept Iterator = true;
+
+  template <typename CT>
+  concept Container = requires(CT b) {
+      { end } -> Iterator; // #66612GH_END
+  };
+
+  static_assert(Container<int>);// expected-error{{static assertion failed}}
+  // expected-note@-1{{because 'int' does not satisfy 'Container'}}
+  // expected-note@#66612GH_END{{because 'end' would be invalid: reference to overloaded function could not be resolved; did you mean to call it?}}
+}
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index 14188f5acd5b3..e8cbca7564603 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -1463,6 +1463,7 @@ TEST(TransferTest, StructModeledFieldsWithAccessor) {
       int getIntNotAccessed() const { return IntNotAccessed; }
       int getIntNoDefinition() const;
       int &getIntRef() { return IntRef; }
+      void returnVoid() const { return; }
     };
 
     void target() {
@@ -1473,6 +1474,14 @@ TEST(TransferTest, StructModeledFieldsWithAccessor) {
       int i2 = s.getWithInc(1);
       int i3 = s.getIntNoDefinition();
      int &iref = s.getIntRef();
+
+      // Regression test: Don't crash on an indirect call (which doesn't have
+      // an associated `CXXMethodDecl`).
+      auto ptr_to_member_fn = &S::getPtr;
+      p1 = (s.*ptr_to_member_fn)();
+
+      // Regression test: Don't crash on a return statement without a value.
+ s.returnVoid(); // [[p]] } )"; diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index d318e26c6a868..6272e7116846c 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -37,7 +37,7 @@ namespace __hwasan { class ScopedReport { public: - ScopedReport(bool fatal = false) : fatal(fatal) { + explicit ScopedReport(bool fatal) : fatal(fatal) { Lock lock(&error_message_lock_); error_message_ptr_ = fatal ? &error_message_ : nullptr; ++hwasan_report_count; @@ -108,29 +108,45 @@ static void MaybePrintAndroidHelpUrl() { #endif } +namespace { // A RAII object that holds a copy of the current thread stack ring buffer. // The actual stack buffer may change while we are iterating over it (for // example, Printf may call syslog() which can itself be built with hwasan). class SavedStackAllocations { public: - SavedStackAllocations(StackAllocationsRingBuffer *rb) { + SavedStackAllocations() = default; + + explicit SavedStackAllocations(Thread *t) { CopyFrom(t); } + + void CopyFrom(Thread *t) { + StackAllocationsRingBuffer *rb = t->stack_allocations(); uptr size = rb->size() * sizeof(uptr); void *storage = MmapAlignedOrDieOnFatalError(size, size * 2, "saved stack allocations"); new (&rb_) StackAllocationsRingBuffer(*rb, storage); + thread_id_ = t->unique_id(); } ~SavedStackAllocations() { - StackAllocationsRingBuffer *rb = get(); - UnmapOrDie(rb->StartOfStorage(), rb->size() * sizeof(uptr)); + if (rb_) { + StackAllocationsRingBuffer *rb = get(); + UnmapOrDie(rb->StartOfStorage(), rb->size() * sizeof(uptr)); + } + } + + const StackAllocationsRingBuffer *get() const { + return (const StackAllocationsRingBuffer *)&rb_; } StackAllocationsRingBuffer *get() { return (StackAllocationsRingBuffer *)&rb_; } + u32 thread_id() const { return thread_id_; } + private: - uptr rb_; + uptr rb_ = 0; + u32 thread_id_; }; class Decorator: public __sanitizer::SanitizerCommonDecorator { @@ -143,6 +159,7 @@ class Decorator: public __sanitizer::SanitizerCommonDecorator { const char *Location() { return Green(); } const char *Thread() { return Green(); } }; +} // namespace static bool FindHeapAllocation(HeapAllocationsRingBuffer *rb, uptr tagged_addr, HeapAllocationRecord *har, uptr *ring_index, @@ -183,7 +200,7 @@ static bool FindHeapAllocation(HeapAllocationsRingBuffer *rb, uptr tagged_addr, return false; } -static void PrintStackAllocations(StackAllocationsRingBuffer *sa, +static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, tag_t addr_tag, uptr untagged_addr) { uptr frames = Min((uptr)flags()->stack_history_size, sa->size()); bool found_local = false; @@ -373,12 +390,117 @@ static void ShowHeapOrGlobalCandidate(uptr untagged_addr, tag_t *candidate, } } -void PrintAddressDescription( - uptr tagged_addr, uptr access_size, - StackAllocationsRingBuffer *current_stack_allocations) { +void ReportStats() {} + +static void PrintTagInfoAroundAddr(tag_t *tag_ptr, uptr num_rows, + void (*print_tag)(InternalScopedString &s, + tag_t *tag)) { + const uptr row_len = 16; // better be power of two. + tag_t *center_row_beg = reinterpret_cast( + RoundDownTo(reinterpret_cast(tag_ptr), row_len)); + tag_t *beg_row = center_row_beg - row_len * (num_rows / 2); + tag_t *end_row = center_row_beg + row_len * ((num_rows + 1) / 2); + InternalScopedString s; + for (tag_t *row = beg_row; row < end_row; row += row_len) { + s.Append(row == center_row_beg ? 
"=>" : " "); + s.AppendF("%p:", (void *)ShadowToMem(reinterpret_cast(row))); + for (uptr i = 0; i < row_len; i++) { + s.Append(row + i == tag_ptr ? "[" : " "); + print_tag(s, &row[i]); + s.Append(row + i == tag_ptr ? "]" : " "); + } + s.AppendF("\n"); + } + Printf("%s", s.data()); +} + +static void PrintTagsAroundAddr(tag_t *tag_ptr) { + Printf( + "Memory tags around the buggy address (one tag corresponds to %zd " + "bytes):\n", + kShadowAlignment); + PrintTagInfoAroundAddr(tag_ptr, 17, [](InternalScopedString &s, tag_t *tag) { + s.AppendF("%02x", *tag); + }); + + Printf( + "Tags for short granules around the buggy address (one tag corresponds " + "to %zd bytes):\n", + kShadowAlignment); + PrintTagInfoAroundAddr(tag_ptr, 3, [](InternalScopedString &s, tag_t *tag) { + if (*tag >= 1 && *tag <= kShadowAlignment) { + uptr granule_addr = ShadowToMem(reinterpret_cast(tag)); + s.AppendF("%02x", + *reinterpret_cast(granule_addr + kShadowAlignment - 1)); + } else { + s.AppendF(".."); + } + }); + Printf( + "See " + "https://clang.llvm.org/docs/" + "HardwareAssistedAddressSanitizerDesign.html#short-granules for a " + "description of short granule tags\n"); +} + +static uptr GetTopPc(StackTrace *stack) { + return stack->size ? StackTrace::GetPreviousInstructionPc(stack->trace[0]) + : 0; +} + +namespace { +class BaseReport { + public: + BaseReport(StackTrace *stack, bool fatal, uptr tagged_addr, uptr access_size) + : scoped_report(fatal), + stack(stack), + tagged_addr(tagged_addr), + access_size(access_size), + untagged_addr(UntagAddr(tagged_addr)), + ptr_tag(GetTagFromPointer(tagged_addr)) { + if (MemIsShadow(untagged_addr)) + return; + + HwasanChunkView chunk = FindHeapChunkByAddress(untagged_addr); + heap.begin = chunk.Beg(); + if (heap.begin) { + heap.size = chunk.ActualSize(); + heap.from_small_heap = chunk.FromSmallHeap(); + heap.is_allocated = chunk.IsAllocated(); + } + + hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { + if (stack_allocations_count < ARRAY_SIZE(stack_allocations) && + t->AddrIsInStack(untagged_addr)) { + stack_allocations[stack_allocations_count++].CopyFrom(t); + } + }); + } + + protected: + void PrintAddressDescription() const; + + ScopedReport scoped_report; + StackTrace *stack = nullptr; + uptr tagged_addr = 0; + uptr access_size = 0; + uptr untagged_addr = 0; + tag_t ptr_tag = 0; + + uptr stack_allocations_count = 0; + SavedStackAllocations stack_allocations[16]; + + struct { + uptr begin = 0; + uptr size = 0; + bool from_small_heap = false; + bool is_allocated = false; + } heap; +}; + +void BaseReport::PrintAddressDescription() const { Decorator d; int num_descriptions_printed = 0; - uptr untagged_addr = UntagAddr(tagged_addr); if (MemIsShadow(untagged_addr)) { Printf("%s%p is HWAsan shadow memory.\n%s", d.Location(), untagged_addr, @@ -387,61 +509,52 @@ void PrintAddressDescription( } // Print some very basic information about the address, if it's a heap. - HwasanChunkView chunk = FindHeapChunkByAddress(untagged_addr); - if (uptr beg = chunk.Beg()) { - uptr size = chunk.ActualSize(); - Printf("%s[%p,%p) is a %s %s heap chunk; " - "size: %zd offset: %zd\n%s", - d.Location(), - beg, beg + size, - chunk.FromSmallHeap() ? "small" : "large", - chunk.IsAllocated() ? "allocated" : "unallocated", - size, untagged_addr - beg, - d.Default()); + if (heap.begin) { + Printf( + "%s[%p,%p) is a %s %s heap chunk; " + "size: %zd offset: %zd\n%s", + d.Location(), heap.begin, heap.begin + heap.size, + heap.from_small_heap ? "small" : "large", + heap.is_allocated ? 
"allocated" : "unallocated", heap.size, + untagged_addr - heap.begin, d.Default()); } - tag_t addr_tag = GetTagFromPointer(tagged_addr); - - bool on_stack = false; // Check stack first. If the address is on the stack of a live thread, we // know it cannot be a heap / global overflow. - hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { - if (t->AddrIsInStack(untagged_addr)) { - on_stack = true; - // TODO(fmayer): figure out how to distinguish use-after-return and - // stack-buffer-overflow. - Printf("%s", d.Error()); - Printf("\nCause: stack tag-mismatch\n"); - Printf("%s", d.Location()); - Printf("Address %p is located in stack of thread T%zd\n", untagged_addr, - t->unique_id()); - Printf("%s", d.Default()); - t->Announce(); + for (uptr i = 0; i < stack_allocations_count; ++i) { + auto &allocations = stack_allocations[i]; + // TODO(fmayer): figure out how to distinguish use-after-return and + // stack-buffer-overflow. + Printf("%s", d.Error()); + Printf("\nCause: stack tag-mismatch\n"); + Printf("%s", d.Location()); + Printf("Address %p is located in stack of thread T%zd\n", untagged_addr, + allocations.thread_id()); + Printf("%s", d.Default()); + hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { + if (allocations.thread_id() == t->unique_id()) + t->Announce(); + }); - auto *sa = (t == GetCurrentThread() && current_stack_allocations) - ? current_stack_allocations - : t->stack_allocations(); - PrintStackAllocations(sa, addr_tag, untagged_addr); - num_descriptions_printed++; - } - }); + PrintStackAllocations(allocations.get(), ptr_tag, untagged_addr); + num_descriptions_printed++; + } // Check if this looks like a heap buffer overflow by scanning // the shadow left and right and looking for the first adjacent - // object with a different memory tag. If that tag matches addr_tag, + // object with a different memory tag. If that tag matches ptr_tag, // check the allocator if it has a live chunk there. tag_t *tag_ptr = reinterpret_cast(MemToShadow(untagged_addr)); tag_t *candidate = nullptr, *left = tag_ptr, *right = tag_ptr; uptr candidate_distance = 0; for (; candidate_distance < 1000; candidate_distance++) { - if (MemIsShadow(reinterpret_cast(left)) && - TagsEqual(addr_tag, left)) { + if (MemIsShadow(reinterpret_cast(left)) && TagsEqual(ptr_tag, left)) { candidate = left; break; } --left; if (MemIsShadow(reinterpret_cast(right)) && - TagsEqual(addr_tag, right)) { + TagsEqual(ptr_tag, right)) { candidate = right; break; } @@ -450,7 +563,8 @@ void PrintAddressDescription( constexpr auto kCloseCandidateDistance = 1; - if (!on_stack && candidate && candidate_distance <= kCloseCandidateDistance) { + if (!stack_allocations_count && candidate && + candidate_distance <= kCloseCandidateDistance) { ShowHeapOrGlobalCandidate(untagged_addr, candidate, left, right); num_descriptions_printed++; } @@ -512,68 +626,16 @@ void PrintAddressDescription( } } -void ReportStats() {} - -static void PrintTagInfoAroundAddr(tag_t *tag_ptr, uptr num_rows, - void (*print_tag)(InternalScopedString &s, - tag_t *tag)) { - const uptr row_len = 16; // better be power of two. - tag_t *center_row_beg = reinterpret_cast( - RoundDownTo(reinterpret_cast(tag_ptr), row_len)); - tag_t *beg_row = center_row_beg - row_len * (num_rows / 2); - tag_t *end_row = center_row_beg + row_len * ((num_rows + 1) / 2); - InternalScopedString s; - for (tag_t *row = beg_row; row < end_row; row += row_len) { - s.Append(row == center_row_beg ? 
"=>" : " "); - s.AppendF("%p:", (void *)ShadowToMem(reinterpret_cast(row))); - for (uptr i = 0; i < row_len; i++) { - s.Append(row + i == tag_ptr ? "[" : " "); - print_tag(s, &row[i]); - s.Append(row + i == tag_ptr ? "]" : " "); - } - s.AppendF("\n"); - } - Printf("%s", s.data()); -} - -static void PrintTagsAroundAddr(tag_t *tag_ptr) { - Printf( - "Memory tags around the buggy address (one tag corresponds to %zd " - "bytes):\n", kShadowAlignment); - PrintTagInfoAroundAddr(tag_ptr, 17, [](InternalScopedString &s, tag_t *tag) { - s.AppendF("%02x", *tag); - }); - - Printf( - "Tags for short granules around the buggy address (one tag corresponds " - "to %zd bytes):\n", - kShadowAlignment); - PrintTagInfoAroundAddr(tag_ptr, 3, [](InternalScopedString &s, tag_t *tag) { - if (*tag >= 1 && *tag <= kShadowAlignment) { - uptr granule_addr = ShadowToMem(reinterpret_cast(tag)); - s.AppendF("%02x", - *reinterpret_cast(granule_addr + kShadowAlignment - 1)); - } else { - s.AppendF(".."); - } - }); - Printf( - "See " - "https://clang.llvm.org/docs/" - "HardwareAssistedAddressSanitizerDesign.html#short-granules for a " - "description of short granule tags\n"); -} - -uptr GetTopPc(StackTrace *stack) { - return stack->size ? StackTrace::GetPreviousInstructionPc(stack->trace[0]) - : 0; -} +class InvalidFreeReport : public BaseReport { + public: + InvalidFreeReport(StackTrace *stack, uptr tagged_addr) + : BaseReport(stack, flags()->halt_on_error, tagged_addr, 0) {} + ~InvalidFreeReport(); -void ReportInvalidFree(StackTrace *stack, uptr tagged_addr) { - ScopedReport R(flags()->halt_on_error); + private: +}; - uptr untagged_addr = UntagAddr(tagged_addr); - tag_t ptr_tag = GetTagFromPointer(tagged_addr); +InvalidFreeReport::~InvalidFreeReport() { tag_t *tag_ptr = nullptr; tag_t mem_tag = 0; if (MemIsApp(untagged_addr)) { @@ -602,7 +664,7 @@ void ReportInvalidFree(StackTrace *stack, uptr tagged_addr) { stack->Print(); - PrintAddressDescription(tagged_addr, 0, nullptr); + PrintAddressDescription(); if (tag_ptr) PrintTagsAroundAddr(tag_ptr); @@ -611,21 +673,31 @@ void ReportInvalidFree(StackTrace *stack, uptr tagged_addr) { ReportErrorSummary(bug_type, stack); } -void ReportTailOverwritten(StackTrace *stack, uptr tagged_addr, uptr orig_size, - const u8 *expected) { +class TailOverwrittenReport : public BaseReport { + public: + explicit TailOverwrittenReport(StackTrace *stack, uptr tagged_addr, + uptr orig_size, const u8 *expected) + : BaseReport(stack, flags()->halt_on_error, tagged_addr, 0), + orig_size(orig_size), + expected(expected) {} + ~TailOverwrittenReport(); + + private: + uptr orig_size; + const u8 *expected; +}; + +TailOverwrittenReport::~TailOverwrittenReport() { uptr tail_size = kShadowAlignment - (orig_size % kShadowAlignment); u8 actual_expected[kShadowAlignment]; internal_memcpy(actual_expected, expected, tail_size); - tag_t ptr_tag = GetTagFromPointer(tagged_addr); // Short granule is stashed in the last byte of the magic string. To avoid // confusion, make the expected magic string contain the short granule tag. 
if (orig_size % kShadowAlignment != 0) { actual_expected[tail_size - 1] = ptr_tag; } - ScopedReport R(flags()->halt_on_error); Decorator d; - uptr untagged_addr = UntagAddr(tagged_addr); Printf("%s", d.Error()); const char *bug_type = "allocation-tail-overwritten"; Report("ERROR: %s: %s; heap object [%p,%p) of size %zd\n", SanitizerToolName, @@ -683,14 +755,25 @@ void ReportTailOverwritten(StackTrace *stack, uptr tagged_addr, uptr orig_size, ReportErrorSummary(bug_type, stack); } -void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size, - bool is_store, bool fatal, uptr *registers_frame) { - ScopedReport R(fatal); - SavedStackAllocations current_stack_allocations( - GetCurrentThread()->stack_allocations()); +class TagMismatchReport : public BaseReport { + public: + explicit TagMismatchReport(StackTrace *stack, uptr tagged_addr, + uptr access_size, bool is_store, bool fatal, + uptr *registers_frame) + : BaseReport(stack, fatal, tagged_addr, access_size), + access_size(access_size), + is_store(is_store), + registers_frame(registers_frame) {} + ~TagMismatchReport(); + private: + uptr access_size; + bool is_store; + uptr *registers_frame; +}; + +TagMismatchReport::~TagMismatchReport() { Decorator d; - uptr untagged_addr = UntagAddr(tagged_addr); // TODO: when possible, try to print heap-use-after-free, etc. const char *bug_type = "tag-mismatch"; uptr pc = GetTopPc(stack); @@ -704,7 +787,6 @@ void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size, __hwasan_test_shadow(reinterpret_cast(tagged_addr), access_size); CHECK_GE(offset, 0); CHECK_LT(offset, static_cast(access_size)); - tag_t ptr_tag = GetTagFromPointer(tagged_addr); tag_t *tag_ptr = reinterpret_cast(MemToShadow(untagged_addr + offset)); tag_t mem_tag = *tag_ptr; @@ -741,8 +823,7 @@ void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size, stack->Print(); - PrintAddressDescription(tagged_addr, access_size, - current_stack_allocations.get()); + PrintAddressDescription(); t->Announce(); PrintTagsAroundAddr(tag_ptr); @@ -753,6 +834,22 @@ void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size, MaybePrintAndroidHelpUrl(); ReportErrorSummary(bug_type, stack); } +} // namespace + +void ReportInvalidFree(StackTrace *stack, uptr tagged_addr) { + InvalidFreeReport R(stack, tagged_addr); +} + +void ReportTailOverwritten(StackTrace *stack, uptr tagged_addr, uptr orig_size, + const u8 *expected) { + TailOverwrittenReport R(stack, tagged_addr, orig_size, expected); +} + +void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size, + bool is_store, bool fatal, uptr *registers_frame) { + TagMismatchReport R(stack, tagged_addr, access_size, is_store, fatal, + registers_frame); +} // See the frame breakdown defined in __hwasan_tag_mismatch (from // hwasan_tag_mismatch_{aarch64,riscv64}.S). diff --git a/compiler-rt/test/asan/TestCases/printf-5.c b/compiler-rt/test/asan/TestCases/printf-5.c index eef5223e4e8da..19ff182acce67 100644 --- a/compiler-rt/test/asan/TestCases/printf-5.c +++ b/compiler-rt/test/asan/TestCases/printf-5.c @@ -1,4 +1,4 @@ -// RUN: %clang_asan -O2 %s -o %t +// RUN: %clang_asan -Wno-excess-initializers -O2 %s -o %t // We need replace_intrin=0 to avoid reporting errors in memcpy. 
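+// check_printf toggles ASan's printf argument checking; the RUN lines below
+// exercise both settings via the CHECK-ON and CHECK-OFF prefixes.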
// RUN: %env_asan_opts=replace_intrin=0:check_printf=1 not %run %t 2>&1 | FileCheck --check-prefix=CHECK-ON %s // RUN: %env_asan_opts=replace_intrin=0:check_printf=0 %run %t 2>&1 | FileCheck --check-prefix=CHECK-OFF %s diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py index 5afefb1142fc7..29d7867e80867 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/command/ParseCommand.py @@ -13,7 +13,7 @@ import unittest from copy import copy from pathlib import PurePath -from collections import defaultdict, OrderedDict +from collections import defaultdict, OrderedDict, namedtuple from dex.utils.Exceptions import CommandParseError, NonFloatValueInCommand @@ -83,7 +83,7 @@ def _merge_subcommands(command_name: str, valid_commands: dict) -> dict: def _build_command( - command_type, labels, addresses, raw_text: str, path: str, lineno: str + command_type, labels, addresses, raw_text: str, path, lineno: str ) -> CommandBase: """Build a command object from raw text. @@ -100,11 +100,13 @@ def label_to_line(label_name: str) -> int: line = labels.get(label_name, None) if line != None: return line - raise format_unresolved_label_err(label_name, raw_text, path, lineno) + raise format_unresolved_label_err(label_name, raw_text, path.base, lineno) def get_address_object(address_name: str, offset: int = 0): if address_name not in addresses: - raise format_undeclared_address_err(address_name, raw_text, path, lineno) + raise format_undeclared_address_err( + address_name, raw_text, path.base, lineno + ) return AddressExpression(address_name, offset) valid_commands = _merge_subcommands( @@ -120,7 +122,7 @@ def get_address_object(address_name: str, offset: int = 0): command = eval(raw_text, valid_commands) # pylint: enable=eval-used command.raw_text = raw_text - command.path = path + command.path = path.declared command.lineno = lineno return command @@ -267,7 +269,8 @@ def _find_all_commands_in_file(path, file_lines, valid_commands, source_root_dir labels = {} # dict of {name: line}. addresses = [] # list of addresses. address_resolutions = {} - cmd_path = path + CmdPath = namedtuple("cmd_path", "base declared") + cmd_path = CmdPath(path, path) declared_files = set() commands = defaultdict(dict) paren_balance = 0 @@ -346,17 +349,16 @@ def _find_all_commands_in_file(path, file_lines, valid_commands, source_root_dir elif type(command) is DexDeclareAddress: add_address(addresses, command, path, cmd_point.get_lineno()) elif type(command) is DexDeclareFile: - cmd_path = command.declared_file - if not os.path.isabs(cmd_path): + declared_path = command.declared_file + if not os.path.isabs(declared_path): source_dir = ( source_root_dir if source_root_dir else os.path.dirname(path) ) - cmd_path = os.path.join(source_dir, cmd_path) - # TODO: keep stored paths as PurePaths for 'longer'. 
- cmd_path = str(PurePath(cmd_path)) - declared_files.add(cmd_path) + declared_path = os.path.join(source_dir, declared_path) + cmd_path = CmdPath(cmd_path.base, str(PurePath(declared_path))) + declared_files.add(cmd_path.declared) elif type(command) is DexCommandLine and "DexCommandLine" in commands: msg = "More than one DexCommandLine in file" raise format_parse_err(msg, path, file_lines, err_point) diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_dexdeclarefile.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_dexdeclarefile.cpp new file mode 100644 index 0000000000000..e3f08af204e76 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_dexdeclarefile.cpp @@ -0,0 +1,14 @@ +// Purpose: +// Check that Dexter command syntax errors associate with the line and file +// they appeared in rather than the current declared file. +// +// RUN: %dexter_regression_test_build %s -o %t +// RUN: not %dexter_base test --binary %t --debugger 'lldb' -v -- %s \ +// RUN: | FileCheck %s --implicit-check-not=FAIL-FILENAME-MATCH + +// CHECK: err_syntax_dexdeclarefile.cpp(14): Undeclared address: 'not_been_declared' + +int main() { return 0; } + +// DexDeclareFile('FAIL-FILENAME-MATCH') +// DexExpectWatchValue('example', address('not_been_declared')) diff --git a/flang/include/flang/Lower/OpenMP.h b/flang/include/flang/Lower/OpenMP.h index 3563de7d091e8..c9162761a08d5 100644 --- a/flang/include/flang/Lower/OpenMP.h +++ b/flang/include/flang/Lower/OpenMP.h @@ -36,6 +36,7 @@ struct OmpClauseList; namespace semantics { class Symbol; +class SemanticsContext; } // namespace semantics namespace lower { @@ -51,8 +52,8 @@ struct Variable; void genOpenMPTerminator(fir::FirOpBuilder &, mlir::Operation *, mlir::Location); -void genOpenMPConstruct(AbstractConverter &, pft::Evaluation &, - const parser::OpenMPConstruct &); +void genOpenMPConstruct(AbstractConverter &, semantics::SemanticsContext &, + pft::Evaluation &, const parser::OpenMPConstruct &); void genOpenMPDeclarativeConstruct(AbstractConverter &, pft::Evaluation &, const parser::OpenMPDeclarativeConstruct &); int64_t getCollapseValue(const Fortran::parser::OmpClauseList &clauseList); diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index a3329ad5d3065..d068fc447bcfc 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2320,7 +2320,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { void genFIR(const Fortran::parser::OpenMPConstruct &omp) { mlir::OpBuilder::InsertPoint insertPt = builder->saveInsertionPoint(); localSymbols.pushScope(); - genOpenMPConstruct(*this, getEval(), omp); + genOpenMPConstruct(*this, bridge.getSemanticsContext(), getEval(), omp); const Fortran::parser::OpenMPLoopConstruct *ompLoop = std::get_if(&omp.u); diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h index efac311bec833..662897c9e6c01 100644 --- a/flang/lib/Lower/DirectivesCommon.h +++ b/flang/lib/Lower/DirectivesCommon.h @@ -12,13 +12,15 @@ /// /// A location to place directive utilities shared across multiple lowering /// files, e.g. utilities shared in OpenMP and OpenACC. The header file can -/// be used for both declarations and templated/inline implementations. 
+/// be used for both declarations and templated/inline implementations //===----------------------------------------------------------------------===// #ifndef FORTRAN_LOWER_DIRECTIVES_COMMON_H #define FORTRAN_LOWER_DIRECTIVES_COMMON_H #include "flang/Common/idioms.h" +#include "flang/Evaluate/tools.h" +#include "flang/Lower/AbstractConverter.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/ConvertExpr.h" #include "flang/Lower/ConvertVariable.h" @@ -26,6 +28,7 @@ #include "flang/Lower/OpenMP.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Lower/StatementContext.h" +#include "flang/Lower/Support/Utils.h" #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Todo.h" @@ -36,7 +39,9 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Value.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" +#include #include namespace Fortran { @@ -611,6 +616,309 @@ void createEmptyRegionBlocks( } } +inline mlir::Value +getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter, + fir::FirOpBuilder &builder, + Fortran::lower::SymbolRef sym, mlir::Location loc) { + mlir::Value symAddr = converter.getSymbolAddress(sym); + // TODO: Might need revisiting to handle for non-shared clauses + if (!symAddr) { + if (const auto *details = + sym->detailsIf()) + symAddr = converter.getSymbolAddress(details->symbol()); + } + + if (!symAddr) + llvm::report_fatal_error("could not retrieve symbol address"); + + if (auto boxTy = + fir::unwrapRefType(symAddr.getType()).dyn_cast()) { + if (boxTy.getEleTy().isa()) + TODO(loc, "derived type"); + + // Load the box when baseAddr is a `fir.ref>` or a + // `fir.ref>` type. + if (symAddr.getType().isa()) + return builder.create(loc, symAddr); + } + return symAddr; +} + +/// Generate the bounds operation from the descriptor information. +template +llvm::SmallVector +genBoundsOpsFromBox(fir::FirOpBuilder &builder, mlir::Location loc, + Fortran::lower::AbstractConverter &converter, + fir::ExtendedValue dataExv, mlir::Value box) { + llvm::SmallVector bounds; + mlir::Type idxTy = builder.getIndexType(); + mlir::Type boundTy = builder.getType(); + mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); + assert(box.getType().isa() && + "expect fir.box or fir.class"); + for (unsigned dim = 0; dim < dataExv.rank(); ++dim) { + mlir::Value d = builder.createIntegerConstant(loc, idxTy, dim); + mlir::Value baseLb = + fir::factory::readLowerBound(builder, loc, dataExv, dim, one); + auto dimInfo = + builder.create(loc, idxTy, idxTy, idxTy, box, d); + mlir::Value lb = builder.createIntegerConstant(loc, idxTy, 0); + mlir::Value ub = + builder.create(loc, dimInfo.getExtent(), one); + mlir::Value bound = + builder.create(loc, boundTy, lb, ub, mlir::Value(), + dimInfo.getByteStride(), true, baseLb); + bounds.push_back(bound); + } + return bounds; +} + +/// Generate bounds operation for base array without any subscripts +/// provided. 
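+/// Each dimension is encoded as a zero-based section [0, extent-1] together
+/// with the extent, a unit stride, and the dimension's declared lower bound
+/// (baseLb), so the original Fortran indexing can be recovered.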
+template +llvm::SmallVector +genBaseBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, + Fortran::lower::AbstractConverter &converter, + fir::ExtendedValue dataExv, mlir::Value baseAddr) { + mlir::Type idxTy = builder.getIndexType(); + mlir::Type boundTy = builder.getType(); + llvm::SmallVector bounds; + + if (dataExv.rank() == 0) + return bounds; + + mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); + for (std::size_t dim = 0; dim < dataExv.rank(); ++dim) { + mlir::Value baseLb = + fir::factory::readLowerBound(builder, loc, dataExv, dim, one); + mlir::Value ext = fir::factory::readExtent(builder, loc, dataExv, dim); + mlir::Value lb = builder.createIntegerConstant(loc, idxTy, 0); + + // ub = extent - 1 + mlir::Value ub = builder.create(loc, ext, one); + mlir::Value bound = + builder.create(loc, boundTy, lb, ub, ext, one, false, baseLb); + bounds.push_back(bound); + } + return bounds; +} + +/// Generate bounds operations for an array section when subscripts are +/// provided. +template +llvm::SmallVector +genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, + Fortran::lower::AbstractConverter &converter, + Fortran::lower::StatementContext &stmtCtx, + const std::list &subscripts, + std::stringstream &asFortran, fir::ExtendedValue &dataExv, + mlir::Value baseAddr) { + int dimension = 0; + mlir::Type idxTy = builder.getIndexType(); + mlir::Type boundTy = builder.getType(); + llvm::SmallVector bounds; + + mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); + mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); + for (const auto &subscript : subscripts) { + if (const auto *triplet{ + std::get_if(&subscript.u)}) { + if (dimension != 0) + asFortran << ','; + mlir::Value lbound, ubound, extent; + std::optional lval, uval; + mlir::Value baseLb = + fir::factory::readLowerBound(builder, loc, dataExv, dimension, one); + bool defaultLb = baseLb == one; + mlir::Value stride = one; + bool strideInBytes = false; + + if (fir::unwrapRefType(baseAddr.getType()).isa()) { + mlir::Value d = builder.createIntegerConstant(loc, idxTy, dimension); + auto dimInfo = builder.create(loc, idxTy, idxTy, idxTy, + baseAddr, d); + stride = dimInfo.getByteStride(); + strideInBytes = true; + } + + const auto &lower{std::get<0>(triplet->t)}; + if (lower) { + lval = Fortran::semantics::GetIntValue(lower); + if (lval) { + if (defaultLb) { + lbound = builder.createIntegerConstant(loc, idxTy, *lval - 1); + } else { + mlir::Value lb = builder.createIntegerConstant(loc, idxTy, *lval); + lbound = builder.create(loc, lb, baseLb); + } + asFortran << *lval; + } else { + const Fortran::lower::SomeExpr *lexpr = + Fortran::semantics::GetExpr(*lower); + mlir::Value lb = + fir::getBase(converter.genExprValue(loc, *lexpr, stmtCtx)); + lb = builder.createConvert(loc, baseLb.getType(), lb); + lbound = builder.create(loc, lb, baseLb); + asFortran << lexpr->AsFortran(); + } + } else { + lbound = defaultLb ? 
zero : baseLb; + } + asFortran << ':'; + const auto &upper{std::get<1>(triplet->t)}; + if (upper) { + uval = Fortran::semantics::GetIntValue(upper); + if (uval) { + if (defaultLb) { + ubound = builder.createIntegerConstant(loc, idxTy, *uval - 1); + } else { + mlir::Value ub = builder.createIntegerConstant(loc, idxTy, *uval); + ubound = builder.create(loc, ub, baseLb); + } + asFortran << *uval; + } else { + const Fortran::lower::SomeExpr *uexpr = + Fortran::semantics::GetExpr(*upper); + mlir::Value ub = + fir::getBase(converter.genExprValue(loc, *uexpr, stmtCtx)); + ub = builder.createConvert(loc, baseLb.getType(), ub); + ubound = builder.create(loc, ub, baseLb); + asFortran << uexpr->AsFortran(); + } + } + if (lower && upper) { + if (lval && uval && *uval < *lval) { + mlir::emitError(loc, "zero sized array section"); + break; + } else if (std::get<2>(triplet->t)) { + const auto &strideExpr{std::get<2>(triplet->t)}; + if (strideExpr) { + mlir::emitError(loc, "stride cannot be specified on " + "an OpenMP array section"); + break; + } + } + } + // ub = baseLb + extent - 1 + if (!ubound) { + mlir::Value ext = + fir::factory::readExtent(builder, loc, dataExv, dimension); + mlir::Value lbExt = + builder.create(loc, ext, baseLb); + ubound = builder.create(loc, lbExt, one); + } + mlir::Value bound = builder.create( + loc, boundTy, lbound, ubound, extent, stride, strideInBytes, baseLb); + bounds.push_back(bound); + ++dimension; + } + } + return bounds; +} + +template +mlir::Value gatherDataOperandAddrAndBounds( + Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &builder, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::StatementContext &stmtCtx, const ObjectType &object, + mlir::Location operandLocation, std::stringstream &asFortran, + llvm::SmallVector &bounds) { + mlir::Value baseAddr; + + std::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::Designator &designator) { + if (auto expr{Fortran::semantics::AnalyzeExpr(semanticsContext, + designator)}) { + if ((*expr).Rank() > 0 && + Fortran::parser::Unwrap( + designator)) { + const auto *arrayElement = + Fortran::parser::Unwrap( + designator); + const auto *dataRef = + std::get_if(&designator.u); + fir::ExtendedValue dataExv; + if (Fortran::parser::Unwrap< + Fortran::parser::StructureComponent>( + arrayElement->base)) { + auto exprBase = Fortran::semantics::AnalyzeExpr( + semanticsContext, arrayElement->base); + dataExv = converter.genExprAddr(operandLocation, *exprBase, + stmtCtx); + baseAddr = fir::getBase(dataExv); + asFortran << (*exprBase).AsFortran(); + } else { + const Fortran::parser::Name &name = + Fortran::parser::GetLastName(*dataRef); + baseAddr = getDataOperandBaseAddr( + converter, builder, *name.symbol, operandLocation); + dataExv = converter.getSymbolExtendedValue(*name.symbol); + asFortran << name.ToString(); + } + + if (!arrayElement->subscripts.empty()) { + asFortran << '('; + bounds = genBoundsOps( + builder, operandLocation, converter, stmtCtx, + arrayElement->subscripts, asFortran, dataExv, baseAddr); + } + asFortran << ')'; + } else if (Fortran::parser::Unwrap< + Fortran::parser::StructureComponent>(designator)) { + fir::ExtendedValue compExv = + converter.genExprAddr(operandLocation, *expr, stmtCtx); + baseAddr = fir::getBase(compExv); + if (fir::unwrapRefType(baseAddr.getType()) + .isa()) + bounds = genBaseBoundsOps( + builder, operandLocation, converter, compExv, baseAddr); + asFortran << (*expr).AsFortran(); + + // If the component is an allocatable or pointer the 
result of + // genExprAddr will be the result of a fir.box_addr operation. + // Retrieve the box so we handle it like other descriptor. + if (auto boxAddrOp = mlir::dyn_cast_or_null( + baseAddr.getDefiningOp())) { + baseAddr = boxAddrOp.getVal(); + bounds = genBoundsOpsFromBox( + builder, operandLocation, converter, compExv, baseAddr); + } + } else { + // Scalar or full array. + if (const auto *dataRef{ + std::get_if(&designator.u)}) { + const Fortran::parser::Name &name = + Fortran::parser::GetLastName(*dataRef); + fir::ExtendedValue dataExv = + converter.getSymbolExtendedValue(*name.symbol); + baseAddr = getDataOperandBaseAddr( + converter, builder, *name.symbol, operandLocation); + if (fir::unwrapRefType(baseAddr.getType()) + .isa()) + bounds = genBoundsOpsFromBox( + builder, operandLocation, converter, dataExv, baseAddr); + if (fir::unwrapRefType(baseAddr.getType()) + .isa()) + bounds = genBaseBoundsOps( + builder, operandLocation, converter, dataExv, baseAddr); + asFortran << name.ToString(); + } else { // Unsupported + llvm::report_fatal_error( + "Unsupported type of OpenACC operand"); + } + } + } + }, + [&](const Fortran::parser::Name &name) { + baseAddr = getDataOperandBaseAddr(converter, builder, *name.symbol, + operandLocation); + asFortran << name.ToString(); + }}, + object.u); + return baseAddr; +} + } // namespace lower } // namespace Fortran diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 04078d739ea78..6d4443ef09ed0 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -35,308 +35,6 @@ static constexpr std::int64_t starCst = -1; static unsigned routineCounter = 0; static constexpr llvm::StringRef accRoutinePrefix = "acc_routine_"; -/// Generate the acc.bounds operation from the descriptor information. -static llvm::SmallVector -genBoundsOpsFromBox(fir::FirOpBuilder &builder, mlir::Location loc, - Fortran::lower::AbstractConverter &converter, - fir::ExtendedValue dataExv, mlir::Value box) { - llvm::SmallVector bounds; - mlir::Type idxTy = builder.getIndexType(); - mlir::Type boundTy = builder.getType(); - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - assert(box.getType().isa() && - "expect fir.box or fir.class"); - // Note that with the HLFIR lowering the 'box' is the FIR box - // which may not have correct local lbounds. As long as we only - // use the extents, it should be okay to read the dimensions - // from this box. - for (unsigned dim = 0; dim < dataExv.rank(); ++dim) { - mlir::Value d = builder.createIntegerConstant(loc, idxTy, dim); - mlir::Value baseLb = - fir::factory::readLowerBound(builder, loc, dataExv, dim, one); - auto dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, box, d); - mlir::Value lb = builder.createIntegerConstant(loc, idxTy, 0); - mlir::Value ub = - builder.create(loc, dimInfo.getExtent(), one); - mlir::Value bound = builder.create( - loc, boundTy, lb, ub, mlir::Value(), dimInfo.getByteStride(), true, - baseLb); - bounds.push_back(bound); - } - return bounds; -} - -/// Generate acc.bounds operation for base array without any subscripts -/// provided. 
-static llvm::SmallVector -genBaseBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, - Fortran::lower::AbstractConverter &converter, - fir::ExtendedValue dataExv, mlir::Value baseAddr) { - mlir::Type idxTy = builder.getIndexType(); - mlir::Type boundTy = builder.getType(); - llvm::SmallVector bounds; - - if (dataExv.rank() == 0) - return bounds; - - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - for (std::size_t dim = 0; dim < dataExv.rank(); ++dim) { - mlir::Value baseLb = - fir::factory::readLowerBound(builder, loc, dataExv, dim, one); - mlir::Value ext = fir::factory::readExtent(builder, loc, dataExv, dim); - mlir::Value lb = builder.createIntegerConstant(loc, idxTy, 0); - - // ub = extent - 1 - mlir::Value ub = builder.create(loc, ext, one); - mlir::Value bound = builder.create( - loc, boundTy, lb, ub, ext, one, false, baseLb); - bounds.push_back(bound); - } - return bounds; -} - -/// Generate acc.bounds operations for an array section when subscripts are -/// provided. -static llvm::SmallVector -genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, - Fortran::lower::AbstractConverter &converter, - Fortran::lower::StatementContext &stmtCtx, - const std::list &subscripts, - std::stringstream &asFortran, fir::ExtendedValue &dataExv, - mlir::Value baseAddr) { - int dimension = 0; - mlir::Type idxTy = builder.getIndexType(); - mlir::Type boundTy = builder.getType(); - llvm::SmallVector bounds; - - mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - for (const auto &subscript : subscripts) { - if (const auto *triplet{ - std::get_if(&subscript.u)}) { - if (dimension != 0) - asFortran << ','; - mlir::Value lbound, ubound, extent; - std::optional lval, uval; - mlir::Value baseLb = - fir::factory::readLowerBound(builder, loc, dataExv, dimension, one); - bool defaultLb = baseLb == one; - mlir::Value stride = one; - bool strideInBytes = false; - - if (fir::unwrapRefType(baseAddr.getType()).isa()) { - mlir::Value d = builder.createIntegerConstant(loc, idxTy, dimension); - auto dimInfo = builder.create(loc, idxTy, idxTy, idxTy, - baseAddr, d); - stride = dimInfo.getByteStride(); - strideInBytes = true; - } - - const auto &lower{std::get<0>(triplet->t)}; - if (lower) { - lval = Fortran::semantics::GetIntValue(lower); - if (lval) { - if (defaultLb) { - lbound = builder.createIntegerConstant(loc, idxTy, *lval - 1); - } else { - mlir::Value lb = builder.createIntegerConstant(loc, idxTy, *lval); - lbound = builder.create(loc, lb, baseLb); - } - asFortran << *lval; - } else { - const Fortran::lower::SomeExpr *lexpr = - Fortran::semantics::GetExpr(*lower); - mlir::Value lb = - fir::getBase(converter.genExprValue(loc, *lexpr, stmtCtx)); - lb = builder.createConvert(loc, baseLb.getType(), lb); - lbound = builder.create(loc, lb, baseLb); - asFortran << lexpr->AsFortran(); - } - } else { - lbound = defaultLb ? 
zero : baseLb; - } - asFortran << ':'; - const auto &upper{std::get<1>(triplet->t)}; - if (upper) { - uval = Fortran::semantics::GetIntValue(upper); - if (uval) { - if (defaultLb) { - ubound = builder.createIntegerConstant(loc, idxTy, *uval - 1); - } else { - mlir::Value ub = builder.createIntegerConstant(loc, idxTy, *uval); - ubound = builder.create(loc, ub, baseLb); - } - asFortran << *uval; - } else { - const Fortran::lower::SomeExpr *uexpr = - Fortran::semantics::GetExpr(*upper); - mlir::Value ub = - fir::getBase(converter.genExprValue(loc, *uexpr, stmtCtx)); - ub = builder.createConvert(loc, baseLb.getType(), ub); - ubound = builder.create(loc, ub, baseLb); - asFortran << uexpr->AsFortran(); - } - } - if (lower && upper) { - if (lval && uval && *uval < *lval) { - mlir::emitError(loc, "zero sized array section"); - break; - } else if (std::get<2>(triplet->t)) { - const auto &strideExpr{std::get<2>(triplet->t)}; - if (strideExpr) { - mlir::emitError(loc, "stride cannot be specified on " - "an OpenACC array section"); - break; - } - } - } - // ub = baseLb + extent - 1 - if (!ubound) { - mlir::Value ext = - fir::factory::readExtent(builder, loc, dataExv, dimension); - mlir::Value lbExt = - builder.create(loc, ext, baseLb); - ubound = builder.create(loc, lbExt, one); - } - mlir::Value bound = builder.create( - loc, boundTy, lbound, ubound, extent, stride, strideInBytes, baseLb); - bounds.push_back(bound); - ++dimension; - } - } - return bounds; -} - -static mlir::Value -getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter, - fir::FirOpBuilder &builder, - Fortran::lower::SymbolRef sym, mlir::Location loc) { - mlir::Value symAddr = converter.getSymbolAddress(sym); - // TODO: Might need revisiting to handle for non-shared clauses - if (!symAddr) { - if (const auto *details = - sym->detailsIf()) - symAddr = converter.getSymbolAddress(details->symbol()); - } - - if (!symAddr) - llvm::report_fatal_error("could not retrieve symbol address"); - - if (auto boxTy = - fir::unwrapRefType(symAddr.getType()).dyn_cast()) { - if (boxTy.getEleTy().isa()) - TODO(loc, "derived type"); - - // Load the box when baseAddr is a `fir.ref>` or a - // `fir.ref>` type. 
- if (symAddr.getType().isa()) - return builder.create(loc, symAddr); - } - return symAddr; -} - -static mlir::Value gatherDataOperandAddrAndBounds( - Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &builder, - Fortran::semantics::SemanticsContext &semanticsContext, - Fortran::lower::StatementContext &stmtCtx, - const Fortran::parser::AccObject &accObject, mlir::Location operandLocation, - std::stringstream &asFortran, llvm::SmallVector &bounds) { - mlir::Value baseAddr; - std::visit( - Fortran::common::visitors{ - [&](const Fortran::parser::Designator &designator) { - if (auto expr{Fortran::semantics::AnalyzeExpr(semanticsContext, - designator)}) { - if ((*expr).Rank() > 0 && - Fortran::parser::Unwrap( - designator)) { - const auto *arrayElement = - Fortran::parser::Unwrap( - designator); - const auto *dataRef = - std::get_if(&designator.u); - fir::ExtendedValue dataExv; - if (Fortran::parser::Unwrap< - Fortran::parser::StructureComponent>( - arrayElement->base)) { - auto exprBase = Fortran::semantics::AnalyzeExpr( - semanticsContext, arrayElement->base); - dataExv = converter.genExprAddr(operandLocation, *exprBase, - stmtCtx); - baseAddr = fir::getBase(dataExv); - asFortran << (*exprBase).AsFortran(); - } else { - const Fortran::parser::Name &name = - Fortran::parser::GetLastName(*dataRef); - baseAddr = getDataOperandBaseAddr( - converter, builder, *name.symbol, operandLocation); - dataExv = converter.getSymbolExtendedValue(*name.symbol); - asFortran << name.ToString(); - } - - if (!arrayElement->subscripts.empty()) { - asFortran << '('; - bounds = genBoundsOps(builder, operandLocation, converter, - stmtCtx, arrayElement->subscripts, - asFortran, dataExv, baseAddr); - } - asFortran << ')'; - } else if (Fortran::parser::Unwrap< - Fortran::parser::StructureComponent>(designator)) { - fir::ExtendedValue compExv = - converter.genExprAddr(operandLocation, *expr, stmtCtx); - baseAddr = fir::getBase(compExv); - if (fir::unwrapRefType(baseAddr.getType()) - .isa()) - bounds = genBaseBoundsOps(builder, operandLocation, converter, - compExv, baseAddr); - asFortran << (*expr).AsFortran(); - - // If the component is an allocatable or pointer the result of - // genExprAddr will be the result of a fir.box_addr operation. - // Retrieve the box so we handle it like other descriptor. - if (auto boxAddrOp = mlir::dyn_cast_or_null( - baseAddr.getDefiningOp())) { - baseAddr = boxAddrOp.getVal(); - bounds = genBoundsOpsFromBox(builder, operandLocation, - converter, compExv, baseAddr); - } - } else { - // Scalar or full array. 
- if (const auto *dataRef{ - std::get_if(&designator.u)}) { - const Fortran::parser::Name &name = - Fortran::parser::GetLastName(*dataRef); - fir::ExtendedValue dataExv = - converter.getSymbolExtendedValue(*name.symbol); - baseAddr = getDataOperandBaseAddr( - converter, builder, *name.symbol, operandLocation); - if (fir::unwrapRefType(baseAddr.getType()) - .isa()) - bounds = genBoundsOpsFromBox(builder, operandLocation, - converter, dataExv, baseAddr); - if (fir::unwrapRefType(baseAddr.getType()) - .isa()) - bounds = genBaseBoundsOps(builder, operandLocation, - converter, dataExv, baseAddr); - asFortran << name.ToString(); - } else { // Unsupported - llvm::report_fatal_error( - "Unsupported type of OpenACC operand"); - } - } - } - }, - [&](const Fortran::parser::Name &name) { - baseAddr = getDataOperandBaseAddr(converter, builder, *name.symbol, - operandLocation); - asFortran << name.ToString(); - }}, - accObject.u); - return baseAddr; -} - static mlir::Location genOperandLocation(Fortran::lower::AbstractConverter &converter, const Fortran::parser::AccObject &accObject) { @@ -555,9 +253,10 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, llvm::SmallVector bounds; std::stringstream asFortran; mlir::Location operandLocation = genOperandLocation(converter, accObject); - mlir::Value baseAddr = gatherDataOperandAddrAndBounds( - converter, builder, semanticsContext, stmtCtx, accObject, - operandLocation, asFortran, bounds); + mlir::Value baseAddr = Fortran::lower::gatherDataOperandAddrAndBounds< + Fortran::parser::AccObject, mlir::acc::DataBoundsType, + mlir::acc::DataBoundsOp>(converter, builder, semanticsContext, stmtCtx, + accObject, operandLocation, asFortran, bounds); Op op = createDataEntryOp(builder, operandLocation, baseAddr, asFortran, bounds, structured, implicit, dataClause, baseAddr.getType()); @@ -578,9 +277,10 @@ static void genDeclareDataOperandOperations( llvm::SmallVector bounds; std::stringstream asFortran; mlir::Location operandLocation = genOperandLocation(converter, accObject); - mlir::Value baseAddr = gatherDataOperandAddrAndBounds( - converter, builder, semanticsContext, stmtCtx, accObject, - operandLocation, asFortran, bounds); + mlir::Value baseAddr = Fortran::lower::gatherDataOperandAddrAndBounds< + Fortran::parser::AccObject, mlir::acc::DataBoundsType, + mlir::acc::DataBoundsOp>(converter, builder, semanticsContext, stmtCtx, + accObject, operandLocation, asFortran, bounds); EntryOp op = createDataEntryOp( builder, operandLocation, baseAddr, asFortran, bounds, structured, implicit, dataClause, baseAddr.getType()); @@ -796,9 +496,10 @@ genPrivatizations(const Fortran::parser::AccObjectList &objectList, llvm::SmallVector bounds; std::stringstream asFortran; mlir::Location operandLocation = genOperandLocation(converter, accObject); - mlir::Value baseAddr = gatherDataOperandAddrAndBounds( - converter, builder, semanticsContext, stmtCtx, accObject, - operandLocation, asFortran, bounds); + mlir::Value baseAddr = Fortran::lower::gatherDataOperandAddrAndBounds< + Fortran::parser::AccObject, mlir::acc::DataBoundsType, + mlir::acc::DataBoundsOp>(converter, builder, semanticsContext, stmtCtx, + accObject, operandLocation, asFortran, bounds); RecipeOp recipe; mlir::Type retTy = getTypeFromBounds(bounds, baseAddr.getType()); @@ -1196,9 +897,10 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList, llvm::SmallVector bounds; std::stringstream asFortran; mlir::Location operandLocation = genOperandLocation(converter, accObject); - 
mlir::Value baseAddr = gatherDataOperandAddrAndBounds( - converter, builder, semanticsContext, stmtCtx, accObject, - operandLocation, asFortran, bounds); + mlir::Value baseAddr = Fortran::lower::gatherDataOperandAddrAndBounds< + Fortran::parser::AccObject, mlir::acc::DataBoundsType, + mlir::acc::DataBoundsOp>(converter, builder, semanticsContext, stmtCtx, + accObject, operandLocation, asFortran, bounds); if (hasDynamicShape(bounds)) TODO(operandLocation, "OpenACC reductions with dynamic shaped array"); diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index bc18898426aa8..5f5e968eaaa64 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -41,16 +41,21 @@ using DeclareTargetCapturePair = static Fortran::semantics::Symbol * getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject) { Fortran::semantics::Symbol *sym = nullptr; - std::visit(Fortran::common::visitors{ - [&](const Fortran::parser::Designator &designator) { - if (const Fortran::parser::Name *name = + std::visit( + Fortran::common::visitors{ + [&](const Fortran::parser::Designator &designator) { + if (auto *arrayEle = + Fortran::parser::Unwrap( + designator)) { + sym = GetFirstName(arrayEle->base).symbol; + } else if (const Fortran::parser::Name *name = Fortran::semantics::getDesignatorNameIfDataRef( designator)) { - sym = name->symbol; - } - }, - [&](const Fortran::parser::Name &name) { sym = name.symbol; }}, - ompObject.u); + sym = name->symbol; + } + }, + [&](const Fortran::parser::Name &name) { sym = name.symbol; }}, + ompObject.u); return sym; } @@ -529,8 +534,11 @@ class ClauseProcessor { mlir::Value &result) const; bool processLink(llvm::SmallVectorImpl &result) const; - bool processMap(llvm::SmallVectorImpl &mapOperands, - llvm::SmallVectorImpl &mapTypes) const; + bool processMap(mlir::Location currentLocation, + const llvm::omp::Directive &directive, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::StatementContext &stmtCtx, + llvm::SmallVectorImpl &mapOperands) const; bool processReduction( mlir::Location currentLocation, llvm::SmallVectorImpl &reductionVars, @@ -1653,80 +1661,109 @@ bool ClauseProcessor::processLink( }); } +static mlir::omp::MapInfoOp +createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value baseAddr, std::stringstream &name, + mlir::SmallVector bounds, uint64_t mapType, + mlir::omp::VariableCaptureKind mapCaptureType, bool implicit, + mlir::Type retTy) { + mlir::Value varPtrPtr; + if (auto boxTy = baseAddr.getType().dyn_cast()) { + baseAddr = builder.create(loc, baseAddr); + retTy = baseAddr.getType(); + } + + mlir::omp::MapInfoOp op = + builder.create(loc, retTy, baseAddr); + op.setNameAttr(builder.getStringAttr(name.str())); + op.setImplicit(implicit); + op.setMapType(mapType); + op.setMapCaptureType(mapCaptureType); + + unsigned insPos = 1; + if (varPtrPtr) + op->insertOperands(insPos++, varPtrPtr); + if (bounds.size() > 0) + op->insertOperands(insPos, bounds); + op->setAttr(mlir::omp::MapInfoOp::getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr( + {1, varPtrPtr ? 
1 : 0, static_cast(bounds.size())})); + return op; +} + bool ClauseProcessor::processMap( - llvm::SmallVectorImpl &mapOperands, - llvm::SmallVectorImpl &mapTypes) const { + mlir::Location currentLocation, const llvm::omp::Directive &directive, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::StatementContext &stmtCtx, + llvm::SmallVectorImpl &mapOperands) const { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + return findRepeatableClause( + [&](const ClauseTy::Map *mapClause, + const Fortran::parser::CharBlock &source) { + mlir::Location clauseLocation = converter.genLocation(source); + const auto &oMapType = + std::get>( + mapClause->v.t); + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; + // If the map type is specified, then process it else Tofrom is the + // default. + if (oMapType) { + const Fortran::parser::OmpMapType::Type &mapType = + std::get(oMapType->t); + switch (mapType) { + case Fortran::parser::OmpMapType::Type::To: + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + break; + case Fortran::parser::OmpMapType::Type::From: + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + break; + case Fortran::parser::OmpMapType::Type::Tofrom: + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + break; + case Fortran::parser::OmpMapType::Type::Alloc: + case Fortran::parser::OmpMapType::Type::Release: + // alloc and release is the default map_type for the Target Data + // Ops, i.e. if no bits for map_type is supplied then alloc/release + // is implicitly assumed based on the target directive. Default + // value for Target Data and Enter Data is alloc and for Exit Data + // it is release. + break; + case Fortran::parser::OmpMapType::Type::Delete: + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; + } - return findRepeatableClause< - ClauseTy::Map>([&](const ClauseTy::Map *mapClause, - const Fortran::parser::CharBlock &source) { - mlir::Location clauseLocation = converter.genLocation(source); - const auto &oMapType = - std::get>(mapClause->v.t); - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; - // If the map type is specified, then process it else Tofrom is the default. - if (oMapType) { - const Fortran::parser::OmpMapType::Type &mapType = - std::get(oMapType->t); - switch (mapType) { - case Fortran::parser::OmpMapType::Type::To: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; - break; - case Fortran::parser::OmpMapType::Type::From: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - break; - case Fortran::parser::OmpMapType::Type::Tofrom: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - break; - case Fortran::parser::OmpMapType::Type::Alloc: - case Fortran::parser::OmpMapType::Type::Release: - // alloc and release is the default map_type for the Target Data Ops, - // i.e. if no bits for map_type is supplied then alloc/release is - // implicitly assumed based on the target directive. Default value for - // Target Data and Enter Data is alloc and for Exit Data it is release. 
- break; - case Fortran::parser::OmpMapType::Type::Delete: - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; - } - - if (std::get>( - oMapType->t)) - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; - } else { - mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - } - - // TODO: Add support MapTypeModifiers close, mapper, present, iterator - - mlir::IntegerAttr mapTypeAttr = firOpBuilder.getIntegerAttr( - firOpBuilder.getI64Type(), - static_cast< - std::underlying_type_t>( - mapTypeBits)); - - llvm::SmallVector mapOperand; - // Check for unsupported map operand types. - for (const Fortran::parser::OmpObject &ompObject : - std::get(mapClause->v.t).v) { - if (Fortran::parser::Unwrap(ompObject) || - Fortran::parser::Unwrap( - ompObject)) - TODO(clauseLocation, - "OMPD_target_data for Array Expressions or Structure Components"); - } - genObjectList(std::get(mapClause->v.t), - converter, mapOperand); + if (std::get>( + oMapType->t)) + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; + } else { + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + } - for (mlir::Value mapOp : mapOperand) { - checkMapType(mapOp.getLoc(), mapOp.getType()); - mapOperands.push_back(mapOp); - mapTypes.push_back(mapTypeAttr); - } - }); + for (const Fortran::parser::OmpObject &ompObject : + std::get(mapClause->v.t).v) { + llvm::SmallVector bounds; + std::stringstream asFortran; + mlir::Value baseAddr = Fortran::lower::gatherDataOperandAddrAndBounds< + Fortran::parser::OmpObject, mlir::omp::DataBoundsType, + mlir::omp::DataBoundsOp>(converter, firOpBuilder, + semanticsContext, stmtCtx, ompObject, + clauseLocation, asFortran, bounds); + + // Explicit map captures are captured ByRef by default, + // optimisation passes may alter this to ByCopy or other capture + // types to optimise + mapOperands.push_back(createMapInfoOp( + firOpBuilder, clauseLocation, baseAddr, asFortran, bounds, + static_cast< + std::underlying_type_t>( + mapTypeBits), + mlir::omp::VariableCaptureKind::ByRef, false, + baseAddr.getType())); + } + }); } bool ClauseProcessor::processReduction( @@ -2293,14 +2330,13 @@ genTaskGroupOp(Fortran::lower::AbstractConverter &converter, static mlir::omp::DataOp genDataOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semanticsContext, mlir::Location currentLocation, const Fortran::parser::OmpClauseList &clauseList) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; mlir::Value ifClauseOperand, deviceOperand; llvm::SmallVector mapOperands, devicePtrOperands, deviceAddrOperands; - llvm::SmallVector mapTypes; llvm::SmallVector useDeviceTypes; llvm::SmallVector useDeviceLocs; llvm::SmallVector useDeviceSymbols; @@ -2314,16 +2350,12 @@ genDataOp(Fortran::lower::AbstractConverter &converter, useDeviceSymbols); cp.processUseDeviceAddr(deviceAddrOperands, useDeviceTypes, useDeviceLocs, useDeviceSymbols); - cp.processMap(mapOperands, mapTypes); - - llvm::SmallVector mapTypesAttr(mapTypes.begin(), - mapTypes.end()); - mlir::ArrayAttr mapTypesArrayAttr = - mlir::ArrayAttr::get(firOpBuilder.getContext(), mapTypesAttr); + cp.processMap(currentLocation, llvm::omp::Directive::OMPD_target_data, + semanticsContext, stmtCtx, mapOperands); auto dataOp = converter.getFirOpBuilder().create( currentLocation, ifClauseOperand, deviceOperand, 
devicePtrOperands, - deviceAddrOperands, mapOperands, mapTypesArrayAttr); + deviceAddrOperands, mapOperands); createBodyOfTargetDataOp(converter, dataOp, useDeviceTypes, useDeviceLocs, useDeviceSymbols, currentLocation); return dataOp; @@ -2332,6 +2364,7 @@ genDataOp(Fortran::lower::AbstractConverter &converter, template static OpTy genEnterExitDataOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semanticsContext, mlir::Location currentLocation, const Fortran::parser::OmpClauseList &clauseList) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); @@ -2339,7 +2372,6 @@ genEnterExitDataOp(Fortran::lower::AbstractConverter &converter, mlir::Value ifClauseOperand, deviceOperand; mlir::UnitAttr nowaitAttr; llvm::SmallVector mapOperands; - llvm::SmallVector mapTypes; Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName; llvm::omp::Directive directive; @@ -2359,32 +2391,26 @@ genEnterExitDataOp(Fortran::lower::AbstractConverter &converter, cp.processIf(stmtCtx, directiveName, ifClauseOperand); cp.processDevice(stmtCtx, deviceOperand); cp.processNowait(nowaitAttr); - cp.processMap(mapOperands, mapTypes); + cp.processMap(currentLocation, directive, semanticsContext, stmtCtx, + mapOperands); cp.processTODO(currentLocation, directive); - llvm::SmallVector mapTypesAttr(mapTypes.begin(), - mapTypes.end()); - mlir::ArrayAttr mapTypesArrayAttr = - mlir::ArrayAttr::get(firOpBuilder.getContext(), mapTypesAttr); - return firOpBuilder.create(currentLocation, ifClauseOperand, - deviceOperand, nowaitAttr, mapOperands, - mapTypesArrayAttr); + deviceOperand, nowaitAttr, mapOperands); } static mlir::omp::TargetOp genTargetOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, + Fortran::semantics::SemanticsContext &semanticsContext, mlir::Location currentLocation, const Fortran::parser::OmpClauseList &clauseList, - bool outerCombined = false) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + llvm::omp::Directive directive, bool outerCombined = false) { Fortran::lower::StatementContext stmtCtx; mlir::Value ifClauseOperand, deviceOperand, threadLimitOperand; mlir::UnitAttr nowaitAttr; llvm::SmallVector mapOperands; - llvm::SmallVector mapTypes; ClauseProcessor cp(converter, clauseList); cp.processIf(stmtCtx, @@ -2393,7 +2419,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, cp.processDevice(stmtCtx, deviceOperand); cp.processThreadLimit(stmtCtx, threadLimitOperand); cp.processNowait(nowaitAttr); - cp.processMap(mapOperands, mapTypes); + cp.processMap(currentLocation, directive, semanticsContext, stmtCtx, + mapOperands); cp.processTODO( currentLocation, llvm::omp::Directive::OMPD_target); - llvm::SmallVector mapTypesAttr(mapTypes.begin(), - mapTypes.end()); - mlir::ArrayAttr mapTypesArrayAttr = - mlir::ArrayAttr::get(firOpBuilder.getContext(), mapTypesAttr); - return genOpWithBody( converter, eval, currentLocation, outerCombined, &clauseList, ifClauseOperand, deviceOperand, threadLimitOperand, nowaitAttr, - mapOperands, mapTypesArrayAttr); + mapOperands); } static mlir::omp::TeamsOp @@ -2523,6 +2545,7 @@ getDeclareTargetFunctionDevice( static void genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, + Fortran::semantics::SemanticsContext &semanticsContext, const Fortran::parser::OpenMPSimpleStandaloneConstruct &simpleStandaloneConstruct) { const auto &directive = @@ -2550,15 +2573,15 @@ 
genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter, firOpBuilder.create(currentLocation); break; case llvm::omp::Directive::OMPD_target_data: - genDataOp(converter, currentLocation, opClauseList); + genDataOp(converter, semanticsContext, currentLocation, opClauseList); break; case llvm::omp::Directive::OMPD_target_enter_data: - genEnterExitDataOp(converter, currentLocation, - opClauseList); + genEnterExitDataOp(converter, semanticsContext, + currentLocation, opClauseList); break; case llvm::omp::Directive::OMPD_target_exit_data: - genEnterExitDataOp(converter, currentLocation, - opClauseList); + genEnterExitDataOp(converter, semanticsContext, + currentLocation, opClauseList); break; case llvm::omp::Directive::OMPD_target_update: TODO(currentLocation, "OMPD_target_update"); @@ -2588,12 +2611,14 @@ genOmpFlush(Fortran::lower::AbstractConverter &converter, static void genOMP(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, + Fortran::semantics::SemanticsContext &semanticsContext, const Fortran::parser::OpenMPStandaloneConstruct &standaloneConstruct) { std::visit( Fortran::common::visitors{ [&](const Fortran::parser::OpenMPSimpleStandaloneConstruct &simpleStandaloneConstruct) { - genOmpSimpleStandalone(converter, eval, simpleStandaloneConstruct); + genOmpSimpleStandalone(converter, eval, semanticsContext, + simpleStandaloneConstruct); }, [&](const Fortran::parser::OpenMPFlushConstruct &flushConstruct) { genOmpFlush(converter, eval, flushConstruct); @@ -2611,6 +2636,7 @@ genOMP(Fortran::lower::AbstractConverter &converter, static void genOMP(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, + Fortran::semantics::SemanticsContext &semanticsContext, const Fortran::parser::OpenMPLoopConstruct &loopConstruct) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); llvm::SmallVector lowerBound, upperBound, step, linearVars, @@ -2644,8 +2670,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, if ((llvm::omp::allTargetSet & llvm::omp::loopConstructSet) .test(ompDirective)) { validDirective = true; - genTargetOp(converter, eval, currentLocation, loopOpClauseList, - /*outerCombined=*/true); + genTargetOp(converter, eval, semanticsContext, currentLocation, + loopOpClauseList, ompDirective, /*outerCombined=*/true); } if ((llvm::omp::allTeamsSet & llvm::omp::loopConstructSet) .test(ompDirective)) { @@ -2770,6 +2796,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, static void genOMP(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, + Fortran::semantics::SemanticsContext &semanticsContext, const Fortran::parser::OpenMPBlockConstruct &blockConstruct) { const auto &beginBlockDirective = std::get(blockConstruct.t); @@ -2831,10 +2858,11 @@ genOMP(Fortran::lower::AbstractConverter &converter, endClauseList); break; case llvm::omp::Directive::OMPD_target: - genTargetOp(converter, eval, currentLocation, beginClauseList); + genTargetOp(converter, eval, semanticsContext, currentLocation, + beginClauseList, directive.v); break; case llvm::omp::Directive::OMPD_target_data: - genDataOp(converter, currentLocation, beginClauseList); + genDataOp(converter, semanticsContext, currentLocation, beginClauseList); break; case llvm::omp::Directive::OMPD_task: genTaskOp(converter, eval, currentLocation, beginClauseList); @@ -2854,8 +2882,8 @@ genOMP(Fortran::lower::AbstractConverter &converter, bool combinedDirective = false; if ((llvm::omp::allTargetSet & 
llvm::omp::blockConstructSet) .test(directive.v)) { - genTargetOp(converter, eval, currentLocation, beginClauseList, - /*outerCombined=*/true); + genTargetOp(converter, eval, semanticsContext, currentLocation, + beginClauseList, directive.v, /*outerCombined=*/true); combinedDirective = true; } if ((llvm::omp::allTeamsSet & llvm::omp::blockConstructSet) @@ -3097,13 +3125,14 @@ void Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder, void Fortran::lower::genOpenMPConstruct( Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semanticsContext, Fortran::lower::pft::Evaluation &eval, const Fortran::parser::OpenMPConstruct &ompConstruct) { std::visit( common::visitors{ [&](const Fortran::parser::OpenMPStandaloneConstruct &standaloneConstruct) { - genOMP(converter, eval, standaloneConstruct); + genOMP(converter, eval, semanticsContext, standaloneConstruct); }, [&](const Fortran::parser::OpenMPSectionsConstruct &sectionsConstruct) { @@ -3113,7 +3142,7 @@ void Fortran::lower::genOpenMPConstruct( genOMP(converter, eval, sectionConstruct); }, [&](const Fortran::parser::OpenMPLoopConstruct &loopConstruct) { - genOMP(converter, eval, loopConstruct); + genOMP(converter, eval, semanticsContext, loopConstruct); }, [&](const Fortran::parser::OpenMPDeclarativeAllocate &execAllocConstruct) { @@ -3128,7 +3157,7 @@ void Fortran::lower::genOpenMPConstruct( TODO(converter.getCurrentLocation(), "OpenMPAllocatorsConstruct"); }, [&](const Fortran::parser::OpenMPBlockConstruct &blockConstruct) { - genOMP(converter, eval, blockConstruct); + genOMP(converter, eval, semanticsContext, blockConstruct); }, [&](const Fortran::parser::OpenMPAtomicConstruct &atomicConstruct) { genOMP(converter, eval, atomicConstruct); diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 0ec7e570395e8..6a6a10f632648 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -34,6 +34,7 @@ #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/Transforms/AddComdats.h" #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/BuiltinTypes.h" @@ -3861,6 +3862,14 @@ class FIRToLLVMLowering std::move(pattern)))) { signalPassFailure(); } + + // Run a pass to add comdats to functions that have weak linkage, on + // platforms that support them. + if (fir::getTargetTriple(mod).supportsCOMDAT()) { + mlir::OpPassManager comdatPM("builtin.module"); + comdatPM.addPass(mlir::LLVM::createLLVMAddComdats()); + if (mlir::failed(runPipeline(comdatPM, mod))) + return signalPassFailure(); + } } private: diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 80567b19f9fe5..962b87acd5a80 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1947,7 +1947,9 @@ void fir::IterWhileOp::print(mlir::OpAsmPrinter &p) { /*printBlockTerminators=*/true); } -mlir::Region &fir::IterWhileOp::getLoopBody() { return getRegion(); } +llvm::SmallVector fir::IterWhileOp::getLoopRegions() { + return {&getRegion()}; +} mlir::BlockArgument fir::IterWhileOp::iterArgToBlockArg(mlir::Value iterArg) { for (auto i : llvm::enumerate(getInitArgs())) @@ -2234,7 +2236,9 @@ void fir::DoLoopOp::print(mlir::OpAsmPrinter &p) { printBlockTerminators); } -mlir::Region
&fir::DoLoopOp::getLoopBody() { return getRegion(); } +llvm::SmallVector fir::DoLoopOp::getLoopRegions() { + return {&getRegion()}; +} /// Translate a value passed as an iter_arg to the corresponding block /// argument in the body of the loop. diff --git a/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp b/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp index 6bbcbf84ac0b3..20ef66e0ad48c 100644 --- a/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp +++ b/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp @@ -29,18 +29,112 @@ class OMPEarlyOutliningPass return std::string(parentName) + "_omp_outline_" + std::to_string(count); } + // Given a value, this function iterates over an operation's results and + // returns the index of the result that the value corresponds to. There + // may be a simpler way to do this, however. + unsigned getResultIndex(mlir::Value value, mlir::Operation *op) { + for (unsigned i = 0; i < op->getNumResults(); ++i) { + if (op->getResult(i) == value) + return i; + } + return 0; + } + + bool isDeclareTargetOp(mlir::Operation *op) { + if (fir::AddrOfOp addressOfOp = mlir::dyn_cast(op)) + if (fir::GlobalOp gOp = mlir::dyn_cast( + addressOfOp->getParentOfType().lookupSymbol( + addressOfOp.getSymbol()))) + if (auto declareTargetGlobal = + llvm::dyn_cast( + gOp.getOperation())) + if (declareTargetGlobal.isDeclareTarget()) + return true; + return false; + } + + // Currently used for cloning arguments that are nested. Should be + // extendable where required, perhaps via operation + // specialisation/overloading, if something needs specialised handling. + // NOTE: This results in duplication of some values that would otherwise + // be a single SSA value shared between operations; this is tidied up to + // some extent on lowering.
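The comment on getResultIndex above anticipates a simpler route: mlir::OpResult already records its own position within the defining operation's result list. A hypothetical alternative (a sketch only, not part of this patch) could therefore be:

// Sketch of a simpler getResultIndex via mlir::OpResult, which already
// stores its position within the defining operation. Hypothetical
// alternative, not part of the patch.
#include "mlir/IR/Value.h"

static unsigned getResultIndexAlt(mlir::Value value) {
  if (auto result = mlir::dyn_cast<mlir::OpResult>(value))
    return result.getResultNumber();
  return 0; // Mirror the existing fallback for non-result values.
}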
+ mlir::Operation * + cloneArgAndChildren(mlir::OpBuilder &builder, mlir::Operation *op, + llvm::SetVector &inputs, + mlir::Block::BlockArgListType &newInputs) { + mlir::IRMapping valueMap; + for (auto opValue : op->getOperands()) { + if (opValue.getDefiningOp()) { + auto resIdx = getResultIndex(opValue, opValue.getDefiningOp()); + valueMap.map(opValue, + cloneArgAndChildren(builder, opValue.getDefiningOp(), + inputs, newInputs) + ->getResult(resIdx)); + } else { + for (auto inArg : llvm::zip(inputs, newInputs)) { + if (opValue == std::get<0>(inArg)) + valueMap.map(opValue, std::get<1>(inArg)); + } + } + } + + return builder.clone(*op, valueMap); + } + + void cloneMapOpVariables(mlir::OpBuilder &builder, mlir::IRMapping &valueMap, + mlir::IRMapping &mapInfoMap, + llvm::SetVector &inputs, + mlir::Block::BlockArgListType &newInputs, + mlir::Value varPtr) { + if (fir::BoxAddrOp boxAddrOp = + mlir::dyn_cast_if_present(varPtr.getDefiningOp())) { + mlir::Value newV = + cloneArgAndChildren(builder, boxAddrOp, inputs, newInputs) + ->getResult(0); + mapInfoMap.map(varPtr, newV); + valueMap.map(boxAddrOp, newV); + return; + } + + if (varPtr.getDefiningOp() && isDeclareTargetOp(varPtr.getDefiningOp())) { + fir::AddrOfOp addrOp = + mlir::dyn_cast(varPtr.getDefiningOp()); + mlir::Value newV = builder.clone(*addrOp)->getResult(0); + mapInfoMap.map(varPtr, newV); + valueMap.map(addrOp, newV); + return; + } + + for (auto inArg : llvm::zip(inputs, newInputs)) { + if (varPtr == std::get<0>(inArg)) + mapInfoMap.map(varPtr, std::get<1>(inArg)); + } + } + mlir::func::FuncOp outlineTargetOp(mlir::OpBuilder &builder, mlir::omp::TargetOp &targetOp, mlir::func::FuncOp &parentFunc, unsigned count) { + // NOTE: Once implicit captures are handled appropriately in the initial + // PFT lowering, if that is possible, we can remove the use of + // getUsedValuesDefinedAbove and instead just iterate over the target op's + // operands (or just the map arguments), and perhaps refactor this + // function a little. // Collect inputs llvm::SetVector inputs; - for (auto operand : targetOp.getOperation()->getOperands()) - inputs.insert(operand); - mlir::Region &targetRegion = targetOp.getRegion(); mlir::getUsedValuesDefinedAbove(targetRegion, inputs); + // Filter out declare-target and map entries, which are handled specially + // at the moment; we do not want these to end up as function arguments, + // which would just be more noise in the IR. + for (auto value : inputs) + if (value.getDefiningOp()) + if (mlir::isa(value.getDefiningOp()) || + isDeclareTargetOp(value.getDefiningOp())) + inputs.remove(value); + // Create new function and initialize mlir::FunctionType funcType = builder.getFunctionType( mlir::TypeRange(inputs.getArrayRef()), mlir::TypeRange()); @@ -51,7 +145,7 @@ class OMPEarlyOutliningPass mlir::func::FuncOp::create(loc, funcName, funcType); mlir::Block *entryBlock = newFunc.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); - mlir::ValueRange newInputs = entryBlock->getArguments(); + mlir::Block::BlockArgListType newInputs = entryBlock->getArguments(); // Set the declare target information, the outlined function // is always a host function. @@ -68,10 +162,47 @@ class OMPEarlyOutliningPass newFunc.getOperation())) earlyOutlineOp.setParentName(parentName); - // Create input map from inputs to function parameters. + // Build the value map for the newly generated target operation; we must + // remap most of the inputs.
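The remapping below leans on mlir::IRMapping throughout. As a minimal sketch of the contract being relied on (illustrative only, not part of the patch): any operand with an entry added via map() is substituted by the mapped-to value when an operation is cloned through that mapping.

// Minimal illustration of the mlir::IRMapping contract relied on below:
// operands that have an entry in the mapping are substituted when the
// operation is cloned. Sketch only, not part of the patch.
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"

static mlir::Operation *cloneWithRemap(mlir::OpBuilder &builder,
                                       mlir::Operation &op, mlir::Value oldV,
                                       mlir::Value newV) {
  mlir::IRMapping mapping;
  mapping.map(oldV, newV); // uses of oldV inside the clone become newV
  return builder.clone(op, mapping);
}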
mlir::IRMapping valueMap; - for (auto InArg : llvm::zip(inputs, newInputs)) - valueMap.map(std::get<0>(InArg), std::get<1>(InArg)); + + // Special handling for map: declare-target and regular map variables + // are handled slightly differently for the moment. For declare target, + // the addressOfOp is cloned over, whereas it is skipped for regular map + // variables: we need to know which global is linked to the map operation + // for declare target, but are not concerned with that for regular map + // variables at the moment. We could treat both the same; however, + // cloning only the minimum for now, to avoid optimisations breaking + // segments of the lowering, seems prudent, as that was the original + // intent of the pass. + for (auto oper : targetOp.getOperation()->getOperands()) { + if (auto mapEntry = + mlir::dyn_cast(oper.getDefiningOp())) { + mlir::IRMapping mapInfoMap; + for (auto bound : mapEntry.getBounds()) { + if (auto mapEntryBound = mlir::dyn_cast( + bound.getDefiningOp())) { + mapInfoMap.map(bound, cloneArgAndChildren(builder, mapEntryBound, + inputs, newInputs) + ->getResult(0)); + } + } + + cloneMapOpVariables(builder, valueMap, mapInfoMap, inputs, newInputs, + mapEntry.getVarPtr()); + + if (mapEntry.getVarPtrPtr()) + cloneMapOpVariables(builder, valueMap, mapInfoMap, inputs, newInputs, + mapEntry.getVarPtrPtr()); + + valueMap.map( + mapEntry, + builder.clone(*mapEntry.getOperation(), mapInfoMap)->getResult(0)); + } + } + + for (auto inArg : llvm::zip(inputs, newInputs)) + valueMap.map(std::get<0>(inArg), std::get<1>(inArg)); // Clone the target op into the new function builder.clone(*(targetOp.getOperation()), valueMap); diff --git a/flang/test/Fir/comdat.fir b/flang/test/Fir/comdat.fir new file mode 100644 index 0000000000000..2f5da505e4903 --- /dev/null +++ b/flang/test/Fir/comdat.fir @@ -0,0 +1,41 @@ + +// RUN: fir-opt %s --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes="CHECK-COMDAT" +// RUN: fir-opt %s --fir-to-llvm-ir="target=x86_64-pc-windows-msvc" | FileCheck %s --check-prefixes="CHECK-COMDAT" +// RUN: fir-opt %s --fir-to-llvm-ir="target=aarch64-apple-darwin" | FileCheck %s --check-prefixes="CHECK-NOCOMDAT" +// RUN: fir-opt %s --fir-to-llvm-ir="target=powerpc64-ibm-aix" | FileCheck %s --check-prefixes="CHECK-NOCOMDAT" + +// CHECK-COMDAT: llvm.func linkonce @fun_linkonce(%arg0: i32) -> i32 comdat(@__llvm_comdat::@fun_linkonce) +// CHECK-NOCOMDAT: llvm.func linkonce @fun_linkonce(%arg0: i32) -> i32 { +func.func @fun_linkonce(%arg0: i32) -> i32 attributes {llvm.linkage = #llvm.linkage} { + return %arg0 : i32 +} + +// CHECK-COMDAT: llvm.func linkonce_odr @fun_linkonce_odr(%arg0: i32) -> i32 comdat(@__llvm_comdat::@fun_linkonce_odr) +// CHECK-NOCOMDAT: llvm.func linkonce_odr @fun_linkonce_odr(%arg0: i32) -> i32 { +func.func @fun_linkonce_odr(%arg0: i32) -> i32 attributes {llvm.linkage = #llvm.linkage} { + return %arg0 : i32 +} + +// CHECK-COMDAT: llvm.mlir.global linkonce constant @global_linkonce() comdat(@__llvm_comdat::@global_linkonce) {addr_space = 0 : i32} : i32 +// CHECK-NOCOMDAT: llvm.mlir.global linkonce constant @global_linkonce() {addr_space = 0 : i32} : i32 { +fir.global linkonce @global_linkonce constant : i32 { + %0 = arith.constant 0 : i32 + fir.has_value %0 : i32 +} + +// CHECK-COMDAT: llvm.comdat @__llvm_comdat { +// CHECK-COMDAT: llvm.comdat_selector @fun_linkonce_odr any +// CHECK-COMDAT: llvm.comdat_selector @fun_linkonce any +// CHECK-COMDAT: llvm.comdat_selector @global_linkonce any +// CHECK-COMDAT:
llvm.comdat_selector @global_linkonce_odr any + +// CHECK-NOCOMDAT-NOT: llvm.comdat +// CHECK-NOCOMDAT-NOT: llvm.comdat_selector + + +// CHECK-COMDAT: llvm.mlir.global linkonce_odr constant @global_linkonce_odr() comdat(@__llvm_comdat::@global_linkonce_odr) {addr_space = 0 : i32} : i32 +// CHECK-NOCOMDAT: llvm.mlir.global linkonce_odr constant @global_linkonce_odr() {addr_space = 0 : i32} : i32 { +fir.global linkonce_odr @global_linkonce_odr constant : i32 { + %0 = arith.constant 0 : i32 + fir.has_value %0 : i32 +} \ No newline at end of file diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index 28b89838a6804..501a2bee29321 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -220,35 +220,119 @@ func.func @_QPsimd1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref // ----- func.func @_QPomp_target_data() { + %c1024 = arith.constant 1024 : index %0 = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_dataEa"} + %c1024_0 = arith.constant 1024 : index %1 = fir.alloca !fir.array<1024xi32> {bindc_name = "b", uniq_name = "_QFomp_target_dataEb"} + %c1024_1 = arith.constant 1024 : index %2 = fir.alloca !fir.array<1024xi32> {bindc_name = "c", uniq_name = "_QFomp_target_dataEc"} + %c1024_2 = arith.constant 1024 : index %3 = fir.alloca !fir.array<1024xi32> {bindc_name = "d", uniq_name = "_QFomp_target_dataEd"} - omp.target_enter_data map((to -> %0 : !fir.ref>), (to -> %1 : !fir.ref>), (always, alloc -> %2 : !fir.ref>)) - omp.target_exit_data map((from -> %0 : !fir.ref>), (from -> %1 : !fir.ref>), (release -> %2 : !fir.ref>), (always, delete -> %3 : !fir.ref>)) + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %4 = arith.subi %c1024, %c1 : index + %5 = omp.bounds lower_bound(%c0 : index) upper_bound(%4 : index) extent(%c1024 : index) stride(%c1 : index) start_idx(%c1 : index) + %6 = omp.map_info var_ptr(%0 : !fir.ref>) map_clauses(to) capture(ByRef) bounds(%5) -> !fir.ref> {name = "a"} + %c1_3 = arith.constant 1 : index + %c0_4 = arith.constant 0 : index + %7 = arith.subi %c1024_0, %c1_3 : index + %8 = omp.bounds lower_bound(%c0_4 : index) upper_bound(%7 : index) extent(%c1024_0 : index) stride(%c1_3 : index) start_idx(%c1_3 : index) + %9 = omp.map_info var_ptr(%1 : !fir.ref>) map_clauses(to) capture(ByRef) bounds(%8) -> !fir.ref> {name = "b"} + %c1_5 = arith.constant 1 : index + %c0_6 = arith.constant 0 : index + %10 = arith.subi %c1024_1, %c1_5 : index + %11 = omp.bounds lower_bound(%c0_6 : index) upper_bound(%10 : index) extent(%c1024_1 : index) stride(%c1_5 : index) start_idx(%c1_5 : index) + %12 = omp.map_info var_ptr(%2 : !fir.ref>) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) bounds(%11) -> !fir.ref> {name = "c"} + omp.target_enter_data map_entries(%6, %9, %12 : !fir.ref>, !fir.ref>, !fir.ref>) + %c1_7 = arith.constant 1 : index + %c0_8 = arith.constant 0 : index + %13 = arith.subi %c1024, %c1_7 : index + %14 = omp.bounds lower_bound(%c0_8 : index) upper_bound(%13 : index) extent(%c1024 : index) stride(%c1_7 : index) start_idx(%c1_7 : index) + %15 = omp.map_info var_ptr(%0 : !fir.ref>) map_clauses(from) capture(ByRef) bounds(%14) -> !fir.ref> {name = "a"} + %c1_9 = arith.constant 1 : index + %c0_10 = arith.constant 0 : index + %16 = arith.subi %c1024_0, %c1_9 : index + %17 = omp.bounds lower_bound(%c0_10 : index) upper_bound(%16 : index) extent(%c1024_0 : index) stride(%c1_9 : index) 
start_idx(%c1_9 : index) + %18 = omp.map_info var_ptr(%1 : !fir.ref>) map_clauses(from) capture(ByRef) bounds(%17) -> !fir.ref> {name = "b"} + %c1_11 = arith.constant 1 : index + %c0_12 = arith.constant 0 : index + %19 = arith.subi %c1024_1, %c1_11 : index + %20 = omp.bounds lower_bound(%c0_12 : index) upper_bound(%19 : index) extent(%c1024_1 : index) stride(%c1_11 : index) start_idx(%c1_11 : index) + %21 = omp.map_info var_ptr(%2 : !fir.ref>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%20) -> !fir.ref> {name = "c"} + %c1_13 = arith.constant 1 : index + %c0_14 = arith.constant 0 : index + %22 = arith.subi %c1024_2, %c1_13 : index + %23 = omp.bounds lower_bound(%c0_14 : index) upper_bound(%22 : index) extent(%c1024_2 : index) stride(%c1_13 : index) start_idx(%c1_13 : index) + %24 = omp.map_info var_ptr(%3 : !fir.ref>) map_clauses(always, delete) capture(ByRef) bounds(%23) -> !fir.ref> {name = "d"} + omp.target_exit_data map_entries(%15, %18, %21, %24 : !fir.ref>, !fir.ref>, !fir.ref>, !fir.ref>) return } -// CHECK-LABEL: llvm.func @_QPomp_target_data() { -// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEa"} : (i64) -> !llvm.ptr> -// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x !llvm.array<1024 x i32> {bindc_name = "b", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEb"} : (i64) -> !llvm.ptr> -// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_4]] x !llvm.array<1024 x i32> {bindc_name = "c", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEc"} : (i64) -> !llvm.ptr> -// CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1 : i64) : i64 -// CHECK: %[[VAL_7:.*]] = llvm.alloca %[[VAL_6]] x !llvm.array<1024 x i32> {bindc_name = "d", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEd"} : (i64) -> !llvm.ptr> -// CHECK: omp.target_enter_data map((to -> %[[VAL_1]] : !llvm.ptr>), (to -> %[[VAL_3]] : !llvm.ptr>), (always, alloc -> %[[VAL_5]] : !llvm.ptr>)) -// CHECK: omp.target_exit_data map((from -> %[[VAL_1]] : !llvm.ptr>), (from -> %[[VAL_3]] : !llvm.ptr>), (release -> %[[VAL_5]] : !llvm.ptr>), (always, delete -> %[[VAL_7]] : !llvm.ptr>)) -// CHECK: llvm.return -// CHECK: } + // CHECK-LABEL: llvm.func @_QPomp_target_data() { + // CHECK: %0 = llvm.mlir.constant(1024 : index) : i64 + // CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEa"} : (i64) -> !llvm.ptr> + // CHECK: %3 = llvm.mlir.constant(1024 : index) : i64 + // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK: %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x !llvm.array<1024 x i32> {bindc_name = "b", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEb"} : (i64) -> !llvm.ptr> + // CHECK: %6 = llvm.mlir.constant(1024 : index) : i64 + // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK: %[[VAL_5:.*]] = llvm.alloca %[[VAL_4]] x !llvm.array<1024 x i32> {bindc_name = "c", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, 
uniq_name = "_QFomp_target_dataEc"} : (i64) -> !llvm.ptr> + // CHECK: %9 = llvm.mlir.constant(1024 : index) : i64 + // CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK: %[[VAL_7:.*]] = llvm.alloca %[[VAL_6]] x !llvm.array<1024 x i32> {bindc_name = "d", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_target_dataEd"} : (i64) -> !llvm.ptr> + // CHECK: %12 = llvm.mlir.constant(1 : index) : i64 + // CHECK: %13 = llvm.mlir.constant(0 : index) : i64 + // CHECK: %14 = llvm.mlir.constant(1023 : index) : i64 + // CHECK: %15 = omp.bounds lower_bound(%13 : i64) upper_bound(%14 : i64) extent(%0 : i64) stride(%12 : i64) start_idx(%12 : i64) + // CHECK: %16 = omp.map_info var_ptr(%[[VAL_1]] : !llvm.ptr>) map_clauses(to) capture(ByRef) bounds(%15) -> !llvm.ptr> {name = "a"} + // CHECK: %17 = llvm.mlir.constant(1 : index) : i64 + // CHECK: %18 = llvm.mlir.constant(0 : index) : i64 + // CHECK: %19 = llvm.mlir.constant(1023 : index) : i64 + // CHECK: %20 = omp.bounds lower_bound(%18 : i64) upper_bound(%19 : i64) extent(%3 : i64) stride(%17 : i64) start_idx(%17 : i64) + // CHECK: %21 = omp.map_info var_ptr(%[[VAL_3]] : !llvm.ptr>) map_clauses(to) capture(ByRef) bounds(%20) -> !llvm.ptr> {name = "b"} + // CHECK: %22 = llvm.mlir.constant(1 : index) : i64 + // CHECK: %23 = llvm.mlir.constant(0 : index) : i64 + // CHECK: %24 = llvm.mlir.constant(1023 : index) : i64 + // CHECK: %25 = omp.bounds lower_bound(%23 : i64) upper_bound(%24 : i64) extent(%6 : i64) stride(%22 : i64) start_idx(%22 : i64) + // CHECK: %26 = omp.map_info var_ptr(%[[VAL_5]] : !llvm.ptr>) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) bounds(%25) -> !llvm.ptr> {name = "c"} + // CHECK: omp.target_enter_data map_entries(%16, %21, %26 : !llvm.ptr>, !llvm.ptr>, !llvm.ptr>) + // CHECK: %27 = llvm.mlir.constant(1 : index) : i64 + // CHECK: %28 = llvm.mlir.constant(0 : index) : i64 + // CHECK: %29 = llvm.mlir.constant(1023 : index) : i64 + // CHECK: %30 = omp.bounds lower_bound(%28 : i64) upper_bound(%29 : i64) extent(%0 : i64) stride(%27 : i64) start_idx(%27 : i64) + // CHECK: %31 = omp.map_info var_ptr(%[[VAL_1]] : !llvm.ptr>) map_clauses(from) capture(ByRef) bounds(%30) -> !llvm.ptr> {name = "a"} + // CHECK: %32 = llvm.mlir.constant(1 : index) : i64 + // CHECK: %33 = llvm.mlir.constant(0 : index) : i64 + // CHECK: %34 = llvm.mlir.constant(1023 : index) : i64 + // CHECK: %35 = omp.bounds lower_bound(%33 : i64) upper_bound(%34 : i64) extent(%3 : i64) stride(%32 : i64) start_idx(%32 : i64) + // CHECK: %36 = omp.map_info var_ptr(%[[VAL_3]] : !llvm.ptr>) map_clauses(from) capture(ByRef) bounds(%35) -> !llvm.ptr> {name = "b"} + // CHECK: %37 = llvm.mlir.constant(1 : index) : i64 + // CHECK: %38 = llvm.mlir.constant(0 : index) : i64 + // CHECK: %39 = llvm.mlir.constant(1023 : index) : i64 + // CHECK: %40 = omp.bounds lower_bound(%38 : i64) upper_bound(%39 : i64) extent(%6 : i64) stride(%37 : i64) start_idx(%37 : i64) + // CHECK: %41 = omp.map_info var_ptr(%[[VAL_5]] : !llvm.ptr>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%40) -> !llvm.ptr> {name = "c"} + // CHECK: %42 = llvm.mlir.constant(1 : index) : i64 + // CHECK: %43 = llvm.mlir.constant(0 : index) : i64 + // CHECK: %44 = llvm.mlir.constant(1023 : index) : i64 + // CHECK: %45 = omp.bounds lower_bound(%43 : i64) upper_bound(%44 : i64) extent(%9 : i64) stride(%42 : i64) start_idx(%42 : i64) + // CHECK: %46 = omp.map_info var_ptr(%[[VAL_7]] : !llvm.ptr>) map_clauses(always, delete) capture(ByRef) bounds(%45) -> 
!llvm.ptr> {name = "d"} + // CHECK: omp.target_exit_data map_entries(%31, %36, %41, %46 : !llvm.ptr>, !llvm.ptr>, !llvm.ptr>, !llvm.ptr>) + // CHECK: llvm.return + // CHECK: } // ----- func.func @_QPopenmp_target_data_region() { %0 = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFopenmp_target_data_regionEa"} %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFopenmp_target_data_regionEi"} - omp.target_data map((tofrom -> %0 : !fir.ref>)) { + %c1024 = arith.constant 1024 : index + %c3 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c2 = arith.subi %c1024, %c3 : index + %bound = omp.bounds lower_bound(%c0 : index) upper_bound(%c2 : index) extent(%c1024 : index) stride(%c3 : index) start_idx(%c3 : index) + %entry = omp.map_info var_ptr(%0 : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%bound) -> !fir.ref> {name = "a"} + omp.target_data map_entries(%entry : !fir.ref>) { %c1_i32 = arith.constant 1 : i32 %2 = fir.convert %c1_i32 : (i32) -> index %c1024_i32 = arith.constant 1024 : i32 @@ -281,7 +365,13 @@ func.func @_QPopenmp_target_data_region() { // CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a", in_type = !fir.array<1024xi32>, operandSegmentSizes = array, uniq_name = "_QFopenmp_target_data_regionEa"} : (i64) -> !llvm.ptr> // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFopenmp_target_data_regionEi"} : (i64) -> !llvm.ptr -// CHECK: omp.target_data map((tofrom -> %[[VAL_1]] : !llvm.ptr>)) { +// CHECK: %[[VAL_MAX:.*]] = llvm.mlir.constant(1024 : index) : i64 +// CHECK: %[[VAL_ONE:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[VAL_ZERO:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[VAL_UPPER:.*]] = llvm.mlir.constant(1023 : index) : i64 +// CHECK: %[[VAL_BOUNDS:.*]] = omp.bounds lower_bound(%[[VAL_ZERO]] : i64) upper_bound(%[[VAL_UPPER]] : i64) extent(%[[VAL_MAX]] : i64) stride(%[[VAL_ONE]] : i64) start_idx(%[[VAL_ONE]] : i64) +// CHECK: %[[VAL_MAP:.*]] = omp.map_info var_ptr(%[[VAL_1]] : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) bounds(%[[VAL_BOUNDS]]) -> !llvm.ptr> {name = "a"} +// CHECK: omp.target_data map_entries(%[[VAL_MAP]] : !llvm.ptr>) { // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_5:.*]] = llvm.sext %[[VAL_4]] : i32 to i64 // CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1024 : i32) : i32 @@ -335,25 +425,37 @@ func.func @_QPomp_target_data_empty() { // ----- func.func @_QPomp_target() { + %c512 = arith.constant 512 : index %0 = fir.alloca !fir.array<512xi32> {bindc_name = "a", uniq_name = "_QFomp_targetEa"} %c64_i32 = arith.constant 64 : i32 - omp.target thread_limit(%c64_i32 : i32) map((tofrom -> %0 : !fir.ref>)) { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %1 = arith.subi %c512, %c1 : index + %2 = omp.bounds lower_bound(%c0 : index) upper_bound(%1 : index) extent(%c512 : index) stride(%c1 : index) start_idx(%c1 : index) + %3 = omp.map_info var_ptr(%0 : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%2) -> !fir.ref> {name = "a"} + omp.target thread_limit(%c64_i32 : i32) map_entries(%3 : !fir.ref>) { %c10_i32 = arith.constant 10 : i32 %c1_i64 = arith.constant 1 : i64 %c1_i64_0 = arith.constant 1 : i64 - %1 = arith.subi %c1_i64, %c1_i64_0 : i64 - %2 = fir.coordinate_of %0, %1 : (!fir.ref>, i64) -> !fir.ref - fir.store %c10_i32 to %2 : !fir.ref + %4 = arith.subi %c1_i64, 
%c1_i64_0 : i64 + %5 = fir.coordinate_of %0, %4 : (!fir.ref>, i64) -> !fir.ref + fir.store %c10_i32 to %5 : !fir.ref omp.terminator } return } // CHECK-LABEL: llvm.func @_QPomp_target() { +// CHECK: %[[EXTENT:.*]] = llvm.mlir.constant(512 : index) : i64 // CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<512 x i32> {bindc_name = "a", in_type = !fir.array<512xi32>, operandSegmentSizes = array, uniq_name = "_QFomp_targetEa"} : (i64) -> !llvm.ptr> // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(64 : i32) : i32 -// CHECK: omp.target thread_limit(%[[VAL_2]] : i32) map((tofrom -> %[[VAL_1]] : !llvm.ptr>)) { +// CHECK: %[[STRIDE:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[LOWER:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[UPPER:.*]] = llvm.mlir.constant(511 : index) : i64 +// CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[LOWER]] : i64) upper_bound(%[[UPPER]] : i64) extent(%[[EXTENT]] : i64) stride(%[[STRIDE]] : i64) start_idx(%[[STRIDE]] : i64) +// CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%2 : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !llvm.ptr> {name = "a"} +// CHECK: omp.target thread_limit(%[[VAL_2]] : i32) map_entries(%[[MAP]] : !llvm.ptr>) { // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(10 : i32) : i32 // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(1 : i64) : i64 diff --git a/flang/test/Intrinsics/math-codegen.fir b/flang/test/Intrinsics/math-codegen.fir index 0af896adf3226..62d841253075e 100644 --- a/flang/test/Intrinsics/math-codegen.fir +++ b/flang/test/Intrinsics/math-codegen.fir @@ -1467,9 +1467,15 @@ func.func private @llvm.powi.f64.i32(f64, i32) -> f64 func.func private @pow(f64, f64) -> f64 //--- exponentiation_integer.fir -// RUN: fir-opt %t/exponentiation_integer.fir --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" | FileCheck %t/exponentiation_integer.fir +// RUN: fir-opt %t/exponentiation_integer.fir --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" | FileCheck %t/exponentiation_integer.fir --check-prefixes="CHECK,CHECK-COMDAT" +// RUN: fir-opt %t/exponentiation_integer.fir --fir-to-llvm-ir="target=x86_64-pc-windows-msvc" | FileCheck %t/exponentiation_integer.fir --check-prefixes="CHECK,CHECK-COMDAT" +// RUN: fir-opt %t/exponentiation_integer.fir --fir-to-llvm-ir="target=aarch64-apple-darwin" | FileCheck %t/exponentiation_integer.fir --check-prefixes="CHECK,CHECK-NOCOMDAT" +// CHECK-COMDAT: llvm.comdat_selector @__mlir_math_ipowi_i32 any +// CHECK-NOCOMDAT-NOT: llvm.comdat_selector @__mlir_math_ipowi_i32 any // CHECK: @_QPtest_int4 // CHECK: llvm.call @__mlir_math_ipowi_i32({{%[A-Za-z0-9._]+}}, {{%[A-Za-z0-9._]+}}) : (i32, i32) -> i32 +// CHECK-COMDAT: llvm.func linkonce_odr @__mlir_math_ipowi_i32(%arg0: i32, %arg1: i32) -> i32 comdat(@__llvm_comdat::@__mlir_math_ipowi_i32) +// CHECK-NOCOMDAT: llvm.func linkonce_odr @__mlir_math_ipowi_i32(%arg0: i32, %arg1: i32) -> i32 func.func @_QPtest_int4(%arg0: !fir.ref {fir.bindc_name = "x"}, %arg1: !fir.ref {fir.bindc_name = "y"}, %arg2: !fir.ref {fir.bindc_name = "z"}) { %0 = fir.load %arg0 : !fir.ref diff --git a/flang/test/Lower/OpenMP/array-bounds.f90 b/flang/test/Lower/OpenMP/array-bounds.f90 new file mode 100644 index 0000000000000..697ff44176b3f --- /dev/null +++ b/flang/test/Lower/OpenMP/array-bounds.f90 @@ -0,0 +1,149 @@ +!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes HOST +!RUN: %flang_fc1 -emit-fir -fopenmp 
-fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes DEVICE + +!DEVICE-LABEL: func.func @_QPread_write_section_omp_outline_0( +!DEVICE-SAME: %[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref>, %[[ARG2:.*]]: !fir.ref>) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QPread_write_section"} { +!DEVICE: %[[C1:.*]] = arith.constant 1 : index +!DEVICE: %[[C2:.*]] = arith.constant 4 : index +!DEVICE: %[[C3:.*]] = arith.constant 1 : index +!DEVICE: %[[C4:.*]] = arith.constant 1 : index +!DEVICE: %[[BOUNDS0:.*]] = omp.bounds lower_bound(%[[C1]] : index) upper_bound(%[[C2]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index) +!DEVICE: %[[MAP0:.*]] = omp.map_info var_ptr(%[[ARG1]] : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref> {name = "sp_read(2:5)"} +!DEVICE: %[[C5:.*]] = arith.constant 1 : index +!DEVICE: %[[C6:.*]] = arith.constant 4 : index +!DEVICE: %[[C7:.*]] = arith.constant 1 : index +!DEVICE: %[[C8:.*]] = arith.constant 1 : index +!DEVICE: %[[BOUNDS1:.*]] = omp.bounds lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) stride(%[[C8]] : index) start_idx(%[[C8]] : index) +!DEVICE: %[[MAP1:.*]] = omp.map_info var_ptr(%[[ARG2]] : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref> {name = "sp_write(2:5)"} +!DEVICE: omp.target map_entries(%[[MAP0]], %[[MAP1]] : !fir.ref>, !fir.ref>) { + +!HOST-LABEL: func.func @_QPread_write_section() { +!HOST: %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFread_write_sectionEi"} +!HOST: %[[READ:.*]] = fir.address_of(@_QFread_write_sectionEsp_read) : !fir.ref> +!HOST: %[[WRITE:.*]] = fir.address_of(@_QFread_write_sectionEsp_write) : !fir.ref> +!HOST: %[[C1:.*]] = arith.constant 1 : index +!HOST: %[[C2:.*]] = arith.constant 1 : index +!HOST: %[[C3:.*]] = arith.constant 4 : index +!HOST: %[[BOUNDS0:.*]] = omp.bounds lower_bound(%[[C2]] : index) upper_bound(%[[C3]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index) +!HOST: %[[MAP0:.*]] = omp.map_info var_ptr(%[[READ]] : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref> {name = "sp_read(2:5)"} +!HOST: %[[C4:.*]] = arith.constant 1 : index +!HOST: %[[C5:.*]] = arith.constant 1 : index +!HOST: %[[C6:.*]] = arith.constant 4 : index +!HOST: %[[BOUNDS1:.*]] = omp.bounds lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index) +!HOST: %[[MAP1:.*]] = omp.map_info var_ptr(%[[WRITE]] : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref> {name = "sp_write(2:5)"} +!HOST: omp.target map_entries(%[[MAP0]], %[[MAP1]] : !fir.ref>, !fir.ref>) { + +subroutine read_write_section() + integer :: sp_read(10) = (/1,2,3,4,5,6,7,8,9,10/) + integer :: sp_write(10) = (/0,0,0,0,0,0,0,0,0,0/) + +!$omp target map(tofrom:sp_read(2:5)) map(tofrom:sp_write(2:5)) + do i = 2, 5 + sp_write(i) = sp_read(i) + end do +!$omp end target +end subroutine read_write_section + + +module assumed_array_routines + contains +!DEVICE-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array_omp_outline_0( +!DEVICE-SAME: %[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.box>) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QMassumed_array_routinesPassumed_shape_array"} { +!DEVICE: %[[C0:.*]] = arith.constant 1 : index +!DEVICE: %[[C1:.*]] = arith.constant 4 : index +!DEVICE: %[[C2:.*]] = arith.constant 0 : index +!DEVICE: %[[C3:.*]]:3 = fir.box_dims %[[ARG1]], %[[C2]] : (!fir.box>, index) -> 
(index, index, index) +!DEVICE: %[[C4:.*]] = arith.constant 1 : index +!DEVICE: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C0]] : index) upper_bound(%[[C1]] : index) stride(%[[C3]]#2 : index) start_idx(%[[C4]] : index) {stride_in_bytes = true} +!DEVICE: %[[ARGADDR:.*]] = fir.box_addr %[[ARG1]] : (!fir.box>) -> !fir.ref> +!DEVICE: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARGADDR]] : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} +!DEVICE: omp.target map_entries(%[[MAP]] : !fir.ref>) { + +!HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array( +!HOST-SAME: %[[ARG0:.*]]: !fir.box> {fir.bindc_name = "arr_read_write"}) { +!HOST: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEi"} +!HOST: %[[C0:.*]] = arith.constant 1 : index +!HOST: %[[C1:.*]] = arith.constant 0 : index +!HOST: %[[C2:.*]]:3 = fir.box_dims %arg0, %[[C1]] : (!fir.box>, index) -> (index, index, index) +!HOST: %[[C3:.*]] = arith.constant 1 : index +!HOST: %[[C4:.*]] = arith.constant 4 : index +!HOST: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) stride(%[[C2]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true} +!HOST: %[[ADDROF:.*]] = fir.box_addr %arg0 : (!fir.box>) -> !fir.ref> +!HOST: %[[MAP:.*]] = omp.map_info var_ptr(%[[ADDROF]] : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} +!HOST: omp.target map_entries(%[[MAP]] : !fir.ref>) { + subroutine assumed_shape_array(arr_read_write) + integer, intent(inout) :: arr_read_write(:) + + !$omp target map(tofrom:arr_read_write(2:5)) + do i = 2, 5 + arr_read_write(i) = i + end do + !$omp end target + end subroutine assumed_shape_array + +!DEVICE-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array_omp_outline_0( +!DEVICE-SAME: %[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref>) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QMassumed_array_routinesPassumed_size_array"} { +!DEVICE: %[[C0:.*]] = arith.constant 1 : index +!DEVICE: %[[C1:.*]] = arith.constant 4 : index +!DEVICE: %[[C2:.*]] = arith.constant 1 : index +!DEVICE: %[[C3:.*]] = arith.constant 1 : index +!DEVICE: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C0]] : index) upper_bound(%[[C1]] : index) stride(%[[C3]] : index) start_idx(%[[C3]] : index) +!DEVICE: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARG1]] : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} +!DEVICE: omp.target map_entries(%[[MAP]] : !fir.ref>) { + +!HOST-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array( +!HOST-SAME: %[[ARG0:.*]]: !fir.ref> {fir.bindc_name = "arr_read_write"}) { +!HOST: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"} +!HOST: %[[C0:.*]] = arith.constant 1 : index +!HOST: %[[C1:.*]] = arith.constant 1 : index +!HOST: %[[C2:.*]] = arith.constant 4 : index +!HOST: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C1]] : index) upper_bound(%[[C2]] : index) stride(%[[C0]] : index) start_idx(%[[C0]] : index) +!HOST: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARG0]] : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "arr_read_write(2:5)"} +!HOST: omp.target map_entries(%[[MAP]] : !fir.ref>) { + subroutine assumed_size_array(arr_read_write) + integer, intent(inout) :: arr_read_write(*) + + !$omp target 
map(tofrom:arr_read_write(2:5)) + do i = 2, 5 + arr_read_write(i) = i + end do + !$omp end target + end subroutine assumed_size_array + end module assumed_array_routines + + +!HOST-LABEL:func.func @_QPcall_assumed_shape_and_size_array() { +!HOST:%{{.*}} = arith.constant 20 : index +!HOST:%[[ALLOCA:.*]] = fir.alloca !fir.array<20xi32> {bindc_name = "arr_read_write", uniq_name = "_QFcall_assumed_shape_and_size_arrayEarr_read_write"} +!HOST:%{{.*}} = arith.constant 1 : i64 +!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index +!HOST:%{{.*}} = arith.constant 1 : i64 +!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index +!HOST:%{{.*}} = arith.constant 10 : i64 +!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index +!HOST:%[[SHAPE0:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> +!HOST:%[[SLICE0:.*]] = fir.slice %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.slice<1> +!HOST:%[[ARG0EMB:.*]] = fir.embox %[[ALLOCA]](%[[SHAPE0]]) [%[[SLICE0]]] : (!fir.ref>, !fir.shape<1>, !fir.slice<1>) -> !fir.box> +!HOST:%[[ARG0:.*]] = fir.convert %[[ARG0EMB]] : (!fir.box>) -> !fir.box> +!HOST:fir.call @_QMassumed_array_routinesPassumed_shape_array(%[[ARG0]]) fastmath : (!fir.box>) -> () +!HOST:%{{.*}} = arith.constant 10 : i64 +!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index +!HOST:%{{.*}} = arith.constant 1 : i64 +!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index +!HOST:%{{.*}} = arith.constant 20 : i64 +!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index +!HOST:%[[SHAPE1:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1> +!HOST:%[[SLICE1:.*]] = fir.slice %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.slice<1> +!HOST:%[[ARG1EMB:.*]] = fir.embox %[[ALLOCA]](%[[SHAPE1]]) [%[[SLICE1]]] : (!fir.ref>, !fir.shape<1>, !fir.slice<1>) -> !fir.box> +!HOST:%[[ADDROF:.*]] = fir.box_addr %[[ARG1EMB]] : (!fir.box>) -> !fir.ref> +!HOST:%[[ARG1:.*]] = fir.convert %[[ADDROF]] : (!fir.ref>) -> !fir.ref> +!HOST:fir.call @_QMassumed_array_routinesPassumed_size_array(%[[ARG1]]) fastmath : (!fir.ref>) -> () +!HOST:return +!HOST:} + +subroutine call_assumed_shape_and_size_array + use assumed_array_routines + integer :: arr_read_write(20) + call assumed_shape_array(arr_read_write(1:10)) + call assumed_size_array(arr_read_write(10:20)) +end subroutine call_assumed_shape_and_size_array diff --git a/flang/test/Lower/OpenMP/location.f90 b/flang/test/Lower/OpenMP/location.f90 index 442ca44c93da2..0e36e09b19e19 100644 --- a/flang/test/Lower/OpenMP/location.f90 +++ b/flang/test/Lower/OpenMP/location.f90 @@ -17,7 +17,7 @@ subroutine sub_parallel() !CHECK-LABEL: sub_target subroutine sub_target() print *, x -!CHECK: omp.target {{.*}} { +!CHECK: omp.target { !$omp target print *, x !CHECK: omp.terminator loc(#[[TAR_LOC:.*]]) diff --git a/flang/test/Lower/OpenMP/omp-target-early-outlining.f90 b/flang/test/Lower/OpenMP/omp-target-early-outlining.f90 index d162566222fb1..eac19f889f433 100644 --- a/flang/test/Lower/OpenMP/omp-target-early-outlining.f90 +++ b/flang/test/Lower/OpenMP/omp-target-early-outlining.f90 @@ -8,7 +8,8 @@ !CHECK: func.func @_QPtarget_function !CHECK: func.func @_QPwrite_index_omp_outline_0(%[[ARG0:.*]]: !fir.ref) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QPwrite_index"} { -!CHECK-NEXT: omp.target {{.*}} { +!CHECK-NEXT: %[[map_info0:.*]] = omp.map_info var_ptr(%[[ARG0]]{{.*}} +!CHECK-NEXT: omp.target map_entries(%[[map_info0]]{{.*}} { !CHECK: %[[CONSTANT_VALUE_10:.*]] = arith.constant 10 : i32 !CHECK: fir.store %[[CONSTANT_VALUE_10]] to %[[ARG0]] : 
!fir.ref !CHECK: omp.terminator @@ -16,7 +17,8 @@ !CHECK-NEXT: return !CHECK: func.func @_QPwrite_index_omp_outline_1(%[[ARG1:.*]]: !fir.ref) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QPwrite_index"} { -!CHECK-NEXT: omp.target {{.*}} { +!CHECK-NEXT: %[[map_info1:.*]] = omp.map_info var_ptr(%[[ARG1]]{{.*}} +!CHECK-NEXT: omp.target map_entries(%[[map_info1]]{{.*}} { !CHECK: %[[CONSTANT_VALUE_20:.*]] = arith.constant 20 : i32 !CHECK: fir.store %[[CONSTANT_VALUE_20]] to %[[ARG1]] : !fir.ref !CHECK: omp.terminator @@ -41,3 +43,47 @@ end subroutine WRITE_INDEX SUBROUTINE TARGET_FUNCTION() !$omp declare target END + +!CHECK: func.func @_QParray_bounds_omp_outline_0(%[[ARG0:.*]]: !fir.ref, %[[ARG1:.*]]: !fir.ref>) attributes {omp.declare_target = #omp.declaretarget, omp.outline_parent_name = "_QParray_bounds"} { +!CHECK: %[[C1:.*]] = arith.constant 1 : index +!CHECK: %[[C4:.*]] = arith.constant 4 : index +!CHECK: %[[C1_0:.*]] = arith.constant 1 : index +!CHECK: %[[C1_1:.*]] = arith.constant 1 : index +!CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C1]] : index) upper_bound(%[[C4]] : index) stride(%[[C1_1]] : index) start_idx(%[[C1_1]] : index) +!CHECK: %[[ENTRY:.*]] = omp.map_info var_ptr(%[[ARG1]] : !fir.ref>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "sp_write(2:5)"} +!CHECK: omp.target map_entries(%[[ENTRY]] : !fir.ref>) { +!CHECK: %c2_i32 = arith.constant 2 : i32 +!CHECK: %2 = fir.convert %c2_i32 : (i32) -> index +!CHECK: %c5_i32 = arith.constant 5 : i32 +!CHECK: %3 = fir.convert %c5_i32 : (i32) -> index +!CHECK: %c1_2 = arith.constant 1 : index +!CHECK: %4 = fir.convert %2 : (index) -> i32 +!CHECK: %5:2 = fir.do_loop %arg2 = %2 to %3 step %c1_2 iter_args(%arg3 = %4) -> (index, i32) { +!CHECK: fir.store %arg3 to %[[ARG0]] : !fir.ref +!CHECK: %6 = fir.load %[[ARG0]] : !fir.ref +!CHECK: %7 = fir.load %[[ARG0]] : !fir.ref +!CHECK: %8 = fir.convert %7 : (i32) -> i64 +!CHECK: %c1_i64 = arith.constant 1 : i64 +!CHECK: %9 = arith.subi %8, %c1_i64 : i64 +!CHECK: %10 = fir.coordinate_of %[[ARG1]], %9 : (!fir.ref>, i64) -> !fir.ref +!CHECK: fir.store %6 to %10 : !fir.ref +!CHECK: %11 = arith.addi %arg2, %c1_2 : index +!CHECK: %12 = fir.convert %c1_2 : (index) -> i32 +!CHECK: %13 = fir.load %[[ARG0]] : !fir.ref +!CHECK: %14 = arith.addi %13, %12 : i32 +!CHECK: fir.result %11, %14 : index, i32 +!CHECK: } +!CHECK: fir.store %5#1 to %[[ARG0]] : !fir.ref +!CHECK: omp.terminator +!CHECK: } +!CHECK:return +!CHECK:} + +SUBROUTINE ARRAY_BOUNDS() + INTEGER :: sp_write(10) = (/0,0,0,0,0,0,0,0,0,0/) +!$omp target map(tofrom:sp_write(2:5)) + do i = 2, 5 + sp_write(i) = i + end do +!$omp end target +end subroutine ARRAY_BOUNDS diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 19618c98913d7..9b1fb5c15ac1d 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -7,7 +7,9 @@ !CHECK-LABEL: func.func @_QPomp_target_enter_simple() { subroutine omp_target_enter_simple integer :: a(1024) - !CHECK: omp.target_enter_data map((to -> {{.*}} : !fir.ref>)) + !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) + !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: omp.target_enter_data map_entries(%[[MAP]] : !fir.ref>) !$omp target enter data map(to: a) end subroutine omp_target_enter_simple @@ -21,7 +23,15 @@ subroutine 
omp_target_enter_mt
    integer :: b(1024)
    integer :: c(1024)
    integer :: d(1024)
-   !CHECK: omp.target_enter_data map((to -> {{.*}} : !fir.ref<!fir.array<1024xi32>>), (to -> {{.*}} : !fir.ref<!fir.array<1024xi32>>), (always, alloc -> {{.*}} : !fir.ref<!fir.array<1024xi32>>), (to -> {{.*}} : !fir.ref<!fir.array<1024xi32>>))
+   !CHECK: %[[BOUNDS_0:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_0:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_0]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: %[[BOUNDS_1:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_1:.*]] = omp.map_info var_ptr(%{{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.ref<!fir.array<1024xi32>> {name = "b"}
+   !CHECK: %[[BOUNDS_2:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_2:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref<!fir.array<1024xi32>> {name = "c"}
+   !CHECK: %[[BOUNDS_3:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_3:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_3]]) -> !fir.ref<!fir.array<1024xi32>> {name = "d"}
+   !CHECK: omp.target_enter_data map_entries(%[[MAP_0]], %[[MAP_1]], %[[MAP_2]], %[[MAP_3]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>)
    !$omp target enter data map(to: a, b) map(always, alloc: c) map(to: d)
 end subroutine omp_target_enter_mt
 
@@ -32,7 +42,9 @@ end subroutine omp_target_enter_mt
 !CHECK-LABEL: func.func @_QPomp_target_enter_nowait() {
 subroutine omp_target_enter_nowait
    integer :: a(1024)
-   !CHECK: omp.target_enter_data nowait map((to -> {{.*}} : !fir.ref<!fir.array<1024xi32>>))
+   !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: omp.target_enter_data nowait map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
    !$omp target enter data map(to: a) nowait
 end subroutine omp_target_enter_nowait
 
@@ -48,7 +60,9 @@ subroutine omp_target_enter_if
    !CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1:.*]] : !fir.ref<i32>
    !CHECK: %[[VAL_4:.*]] = arith.constant 10 : i32
    !CHECK: %[[VAL_5:.*]] = arith.cmpi slt, %[[VAL_3]], %[[VAL_4]] : i32
-   !CHECK: omp.target_enter_data if(%[[VAL_5]] : i1) map((to -> {{.*}} : !fir.ref<!fir.array<1024xi32>>))
+   !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: omp.target_enter_data if(%[[VAL_5]] : i1) map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
    !$omp target enter data if(i<10) map(to: a)
 end subroutine omp_target_enter_if
 
@@ -60,7 +74,9 @@ end subroutine omp_target_enter_if
 subroutine omp_target_enter_device
    integer :: a(1024)
    !CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32
-   !CHECK: omp.target_enter_data device(%[[VAL_1]] : i32) map((to -> {{.*}} : !fir.ref<!fir.array<1024xi32>>))
+   !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: omp.target_enter_data device(%[[VAL_1]] : i32) map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
    !$omp target enter data map(to: a) device(2)
 end subroutine omp_target_enter_device
 
@@ -71,7 +87,9 @@ end subroutine omp_target_enter_device
 !CHECK-LABEL: func.func @_QPomp_target_exit_simple() {
 subroutine omp_target_exit_simple
    integer :: a(1024)
-   !CHECK: omp.target_exit_data map((from -> {{.*}} : !fir.ref<!fir.array<1024xi32>>))
+   !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: omp.target_exit_data map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
    !$omp target exit data map(from: a)
 end subroutine omp_target_exit_simple
 
@@ -86,7 +104,17 @@ subroutine omp_target_exit_mt
    integer :: c(1024)
    integer :: d(1024)
    integer :: e(1024)
-   !CHECK: omp.target_exit_data map((from -> {{.*}} : !fir.ref<!fir.array<1024xi32>>), (from -> {{.*}} : !fir.ref<!fir.array<1024xi32>>), (release -> {{.*}} : !fir.ref<!fir.array<1024xi32>>), (always, delete -> {{.*}} : !fir.ref<!fir.array<1024xi32>>), (from -> {{.*}} : !fir.ref<!fir.array<1024xi32>>))
+   !CHECK: %[[BOUNDS_0:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_0:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_0]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: %[[BOUNDS_1:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_1:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.ref<!fir.array<1024xi32>> {name = "b"}
+   !CHECK: %[[BOUNDS_2:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_2:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref<!fir.array<1024xi32>> {name = "c"}
+   !CHECK: %[[BOUNDS_3:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_3:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(always, delete) capture(ByRef) bounds(%[[BOUNDS_3]]) -> !fir.ref<!fir.array<1024xi32>> {name = "d"}
+   !CHECK: %[[BOUNDS_4:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_4:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_4]]) -> !fir.ref<!fir.array<1024xi32>> {name = "e"}
+   !CHECK: omp.target_exit_data map_entries(%[[MAP_0]], %[[MAP_1]], %[[MAP_2]], %[[MAP_3]], %[[MAP_4]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>)
    !$omp target exit data map(from: a,b) map(release: c) map(always, delete: d) map(from: e)
 end subroutine omp_target_exit_mt
 
@@ -99,7 +127,9 @@ subroutine omp_target_exit_device
    integer :: a(1024)
    integer :: d
    !CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1:.*]] : !fir.ref<i32>
-   !CHECK: omp.target_exit_data device(%[[VAL_2]] : i32) map((from -> {{.*}} : !fir.ref<!fir.array<1024xi32>>))
+   !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: omp.target_exit_data device(%[[VAL_2]] : i32) map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
    !$omp target exit data map(from: a) device(d)
 end subroutine omp_target_exit_device
 
@@ -111,7 +141,9 @@ end subroutine omp_target_exit_device
 subroutine omp_target_data
    !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_dataEa"}
    integer :: a(1024)
-   !CHECK: omp.target_data map((tofrom -> %[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>)) {
+   !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: omp.target_data map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>) {
    !$omp target data map(tofrom: a)
    !CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32
    !CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64
@@ -131,12 +163,16 @@ subroutine omp_target_data_mt
    integer :: b(1024)
    !CHECK: %[[VAR_A:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_data_mtEa"}
    !CHECK: %[[VAR_B:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "b", uniq_name = "_QFomp_target_data_mtEb"}
-   !CHECK: omp.target_data map((tofrom -> %[[VAR_A]] : !fir.ref<!fir.array<1024xi32>>))
+   !CHECK: %[[BOUNDS_A:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[VAR_A]] : !fir.ref<!fir.array<1024xi32>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_A]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: omp.target_data map_entries(%[[MAP_A]] : !fir.ref<!fir.array<1024xi32>>) {
    !$omp target data map(a)
    !CHECK: omp.terminator
    !$omp end target data
    !CHECK: }
-   !CHECK: omp.target_data map((always, from -> %[[VAR_B]] : !fir.ref<!fir.array<1024xi32>>))
+   !CHECK: %[[BOUNDS_B:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP_B:.*]] = omp.map_info var_ptr(%[[VAR_B]] : !fir.ref<!fir.array<1024xi32>>) map_clauses(always, from) capture(ByRef) bounds(%[[BOUNDS_B]]) -> !fir.ref<!fir.array<1024xi32>> {name = "b"}
+   !CHECK: omp.target_data map_entries(%[[MAP_B]] : !fir.ref<!fir.array<1024xi32>>) {
    !$omp target data map(always, from : b)
    !CHECK: omp.terminator
    !$omp end target data
@@ -151,7 +187,9 @@ end subroutine omp_target_data_mt
 subroutine omp_target
    !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_targetEa"}
    integer :: a(1024)
-   !CHECK: omp.target map((tofrom -> %[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>)) {
+   !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: omp.target map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>) {
    !$omp target map(tofrom: a)
    !CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32
    !CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64
@@ -173,7 +211,8 @@ end subroutine omp_target
 subroutine omp_target_thread_limit
    integer :: a
    !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32
-   !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map((tofrom -> %[[VAL_0]] : !fir.ref<i32>)) {
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "a"}
+   !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] : !fir.ref<i32>) {
    !$omp target map(tofrom: a) thread_limit(64)
    a = 10
    !CHECK: omp.terminator
@@ -190,7 +229,8 @@ subroutine omp_target_device_ptr
    use iso_c_binding, only : c_ptr, c_loc
    type(c_ptr) :: a
    integer, target :: b
-   !CHECK: omp.target_data map((tofrom -> %[[VAL_0:.*]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)) use_device_ptr(%[[VAL_0]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> {{.*}} {name = "a"}
+   !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}
    !$omp target data map(tofrom: a) use_device_ptr(a)
    !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>):
    !CHECK: {{.*}} = fir.coordinate_of %[[VAL_1:.*]], {{.*}} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
@@ -207,7 +247,9 @@ end subroutine omp_target_device_ptr
 !CHECK-LABEL: func.func @_QPomp_target_device_addr() {
 subroutine omp_target_device_addr
    integer, pointer :: a
-   !CHECK: omp.target_data map((tofrom -> %[[VAL_0:.*]] : !fir.ref<!fir.box<!fir.ptr<i32>>>)) use_device_addr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>)
+   !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFomp_target_device_addrEa"}
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> {{.*}} {name = "a"}
+   !CHECK: omp.target_data map_entries(%[[MAP]] : {{.*}}) use_device_addr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>) {
    !$omp target data map(tofrom: a) use_device_addr(a)
    !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>):
    !CHECK: {{.*}} = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
@@ -223,11 +265,17 @@ end subroutine omp_target_device_addr
 !CHECK-LABEL: func.func @_QPomp_target_parallel_do() {
 subroutine omp_target_parallel_do
+   !CHECK: %[[C1024:.*]] = arith.constant 1024 : index
    !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_parallel_doEa"}
    integer :: a(1024)
    !CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_target_parallel_doEi"}
    integer :: i
-   !CHECK: omp.target map((tofrom -> %[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>)) {
+   !CHECK: %[[C1:.*]] = arith.constant 1 : index
+   !CHECK: %[[C0:.*]] = arith.constant 0 : index
+   !CHECK: %[[SUB:.*]] = arith.subi %[[C1024]], %[[C1]] : index
+   !CHECK: %[[BOUNDS:.*]] = omp.bounds lower_bound(%[[C0]] : index) upper_bound(%[[SUB]] : index) extent(%[[C1024]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index)
+   !CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
+   !CHECK: omp.target map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>) {
    !CHECK-NEXT: omp.parallel
    !$omp target parallel do map(tofrom: a)
    !CHECK: %[[VAL_2:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
@@ -237,7 +285,7 @@ subroutine omp_target_parallel_do
    !CHECK: omp.wsloop for (%[[VAL_6:.*]]) : i32 = (%[[VAL_3]]) to (%[[VAL_4]]) inclusive step (%[[VAL_5]]) {
    !CHECK: fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<i32>
    !CHECK: %[[VAL_7:.*]] = arith.constant 10 : i32
-   !CHECK: %[[VAL_8:.*]] = fir.load %2 : !fir.ref<i32>
+   !CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
    !CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i32) -> i64
    !CHECK: %[[VAL_10:.*]] = arith.constant 1 : i64
    !CHECK: %[[VAL_11:.*]] = arith.subi %[[VAL_9]], %[[VAL_10]] : i64
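Note (editorial): with this change, each map-clause list item is materialized as an `omp.bounds` plus `omp.map_info` pair that the data-movement op consumes through `map_entries`, instead of being folded into the op itself. For orientation, here is a rough C++ sketch of how such clauses are conventionally encoded as map-type bits on the libomptarget side; the constant names and values are illustrative assumptions modeled on libomptarget's `tgt_map_type`, not something this patch defines:

```
#include <cstdint>
#include <cstdio>

// Hypothetical map-type bits, modeled on libomptarget conventions.
constexpr uint64_t MAP_TO = 0x001;     // map(to: ...) on target enter data
constexpr uint64_t MAP_FROM = 0x002;   // map(from: ...) on target exit data
constexpr uint64_t MAP_ALWAYS = 0x004; // the "always" modifier
constexpr uint64_t MAP_DELETE = 0x008; // map(delete: ...) on target exit data

int main() {
  // "alloc" (on enter) and "release" (on exit) carry no TO/FROM bit, which
  // is why the MLIR clause is spelled exit_release_or_enter_alloc.
  std::printf("map(to: a)             -> 0x%llx\n", (unsigned long long)MAP_TO);
  std::printf("map(always, alloc: c)  -> 0x%llx\n", (unsigned long long)MAP_ALWAYS);
  std::printf("map(always, delete: d) -> 0x%llx\n",
              (unsigned long long)(MAP_ALWAYS | MAP_DELETE));
  return 0;
}
```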
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 2e57275e10bc4..fd2cb9351419c 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -78,7 +78,7 @@ add_gen_header(
 )
 
 # TODO: This should be conditional on POSIX networking being included.
-file(MAKE_DIRECTORY "arpa")
+file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/arpa)
 
 add_gen_header(
   arpa_inet
@@ -280,7 +280,7 @@ add_gen_header(
 # TODO: Not all platforms will have a include/sys directory. Add the sys
 # directory and the targets for sys/*.h files conditional to the OS requiring
 # them.
-file(MAKE_DIRECTORY "sys")
+file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/sys)
 
 add_gen_header(
   sys_auxv
@@ -497,7 +497,7 @@ add_gen_header(
 )
 
 if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
-  file(MAKE_DIRECTORY "gpu")
+  file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu)
 
   add_gen_header(
     gpu_rpc
diff --git a/libc/src/math/docs/add_math_function.md b/libc/src/math/docs/add_math_function.md
index cf4d31661daf3..9efe0620f5d7d 100644
--- a/libc/src/math/docs/add_math_function.md
+++ b/libc/src/math/docs/add_math_function.md
@@ -88,12 +88,17 @@ Besides the usual testing macros like `EXPECT_EQ, ASSERT_TRUE, ...` there are
 testing macros specifically used for floating point values, such as
 `EXPECT_FP_EQ, ASSERT_FP_LE, ...`
 
-- Add unit test to:
+- Add smoke tests (simple cases and zeros / inf / nan inputs or outputs) to:
+```
+  libc/test/src/math/smoke/<func>_test.cpp
+```
+- Add unit tests that might require MPFR to:
 ```
   libc/test/src/math/<func>_test.cpp
 ```
-- Add the corresponding entry point to:
+- Add the corresponding entry points to:
 ```
+  libc/test/src/math/smoke/CMakeLists.txt
   libc/test/src/math/CMakeLists.txt
 ```
@@ -160,6 +165,16 @@ implementation (which is very often glibc).
   $ ninja check-libc
 ```
 
+- Run math smoke tests only:
+```
+  $ ninja libc-math-smoke-tests
+```
+
+- Run math smoke and unit tests:
+```
+  $ ninja libc-math-unittests
+```
+
 - Build and Run a specific unit test:
 ```
   $ ninja libc.test.src.math.<func>_test
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 04b1256e09b46..d47e0e0ea4aad 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_custom_target(libc_math_unittests)
+add_custom_target(libc-math-unittests)
 
 add_library(
   libc_math_test_utils
@@ -10,7 +10,7 @@ add_fp_unittest(
   cosf_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     cosf_test.cpp
   HDRS
@@ -26,7 +26,7 @@ add_fp_unittest(
   cos_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     cos_test.cpp
   DEPENDS
@@ -38,7 +38,7 @@ add_fp_unittest(
   sinf_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     sinf_test.cpp
   HDRS
@@ -54,7 +54,7 @@ add_fp_unittest(
   sin_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     sin_test.cpp
   DEPENDS
@@ -66,7 +66,7 @@ add_fp_unittest(
   sincosf_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     sincosf_test.cpp
   HDRS
@@ -82,7 +82,7 @@ add_fp_unittest(
   tanf_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     tanf_test.cpp
   HDRS
@@ -98,7 +98,7 @@ add_fp_unittest(
   fabs_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     fabs_test.cpp
   HDRS
@@ -113,7 +113,7 @@ add_fp_unittest(
   fabsf_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     fabsf_test.cpp
   HDRS
@@ -128,7 +128,7 @@ add_fp_unittest(
   fabsl_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     fabsl_test.cpp
   HDRS
@@ -143,7 +143,7 @@ add_fp_unittest(
   trunc_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     trunc_test.cpp
   HDRS
@@ -158,7 +158,7 @@ add_fp_unittest(
   truncf_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     truncf_test.cpp
   HDRS
@@ -173,7 +173,7 @@ add_fp_unittest(
   truncl_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     truncl_test.cpp
   HDRS
@@ -188,7 +188,7 @@ add_fp_unittest(
   ceil_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    libc-math-unittests
   SRCS
     ceil_test.cpp
   HDRS
@@ -203,7 +203,7 @@ add_fp_unittest(
   ceilf_test
   NEED_MPFR
   SUITE
-    libc_math_unittests
+    
libc-math-unittests SRCS ceilf_test.cpp HDRS @@ -218,7 +218,7 @@ add_fp_unittest( ceill_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS ceill_test.cpp HDRS @@ -233,7 +233,7 @@ add_fp_unittest( floor_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS floor_test.cpp HDRS @@ -248,7 +248,7 @@ add_fp_unittest( floorf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS floorf_test.cpp HDRS @@ -263,7 +263,7 @@ add_fp_unittest( floorl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS floorl_test.cpp HDRS @@ -278,7 +278,7 @@ add_fp_unittest( round_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS round_test.cpp HDRS @@ -293,7 +293,7 @@ add_fp_unittest( roundf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS roundf_test.cpp HDRS @@ -308,7 +308,7 @@ add_fp_unittest( roundl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS roundl_test.cpp HDRS @@ -323,7 +323,7 @@ add_fp_unittest( lround_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS lround_test.cpp HDRS @@ -342,7 +342,7 @@ add_fp_unittest( lroundf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS lroundf_test.cpp HDRS @@ -361,7 +361,7 @@ add_fp_unittest( lroundl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS lroundl_test.cpp HDRS @@ -380,7 +380,7 @@ add_fp_unittest( llround_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS llround_test.cpp HDRS @@ -399,7 +399,7 @@ add_fp_unittest( llroundf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS llroundf_test.cpp HDRS @@ -418,7 +418,7 @@ add_fp_unittest( llroundl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS llroundl_test.cpp HDRS @@ -437,7 +437,7 @@ add_fp_unittest( rint_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS rint_test.cpp HDRS @@ -453,7 +453,7 @@ add_fp_unittest( rintf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS rintf_test.cpp HDRS @@ -469,7 +469,7 @@ add_fp_unittest( rintl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS rintl_test.cpp HDRS @@ -485,7 +485,7 @@ add_fp_unittest( lrint_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS lrint_test.cpp HDRS @@ -501,7 +501,7 @@ add_fp_unittest( lrintf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS lrintf_test.cpp HDRS @@ -517,7 +517,7 @@ add_fp_unittest( lrintl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS lrintl_test.cpp HDRS @@ -533,7 +533,7 @@ add_fp_unittest( llrint_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS llrint_test.cpp HDRS @@ -549,7 +549,7 @@ add_fp_unittest( llrintf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS llrintf_test.cpp HDRS @@ -565,7 +565,7 @@ add_fp_unittest( llrintl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS llrintl_test.cpp HDRS @@ -581,7 +581,7 @@ add_fp_unittest( expf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS expf_test.cpp DEPENDS @@ -595,7 +595,7 @@ add_fp_unittest( exp_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS exp_test.cpp DEPENDS @@ -609,7 +609,7 @@ add_fp_unittest( exp2f_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS exp2f_test.cpp DEPENDS @@ -623,7 +623,7 @@ add_fp_unittest( exp2_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS exp2_test.cpp DEPENDS 
@@ -637,7 +637,7 @@ add_fp_unittest( exp10f_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS exp10f_test.cpp DEPENDS @@ -651,7 +651,7 @@ add_fp_unittest( exp10_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS exp10_test.cpp DEPENDS @@ -664,7 +664,7 @@ add_fp_unittest( add_fp_unittest( copysign_test SUITE - libc_math_unittests + libc-math-unittests SRCS copysign_test.cpp HDRS @@ -678,7 +678,7 @@ add_fp_unittest( add_fp_unittest( copysignf_test SUITE - libc_math_unittests + libc-math-unittests SRCS copysignf_test.cpp HDRS @@ -692,7 +692,7 @@ add_fp_unittest( add_fp_unittest( copysignl_test SUITE - libc_math_unittests + libc-math-unittests SRCS copysignl_test.cpp HDRS @@ -707,7 +707,7 @@ add_fp_unittest( frexp_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS frexp_test.cpp HDRS @@ -722,7 +722,7 @@ add_fp_unittest( frexpf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS frexpf_test.cpp HDRS @@ -737,7 +737,7 @@ add_fp_unittest( frexpl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS frexpl_test.cpp HDRS @@ -753,7 +753,7 @@ if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( ilogb_test SUITE - libc_math_unittests + libc-math-unittests SRCS ilogb_test.cpp HDRS @@ -768,7 +768,7 @@ if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( ilogbf_test SUITE - libc_math_unittests + libc-math-unittests SRCS ilogbf_test.cpp HDRS @@ -784,7 +784,7 @@ endif() add_fp_unittest( ilogbl_test SUITE - libc_math_unittests + libc-math-unittests SRCS ilogbl_test.cpp HDRS @@ -799,7 +799,7 @@ add_fp_unittest( add_fp_unittest( ldexp_test SUITE - libc_math_unittests + libc-math-unittests SRCS ldexp_test.cpp HDRS @@ -814,7 +814,7 @@ add_fp_unittest( add_fp_unittest( ldexpf_test SUITE - libc_math_unittests + libc-math-unittests SRCS ldexpf_test.cpp HDRS @@ -829,7 +829,7 @@ add_fp_unittest( add_fp_unittest( ldexpl_test SUITE - libc_math_unittests + libc-math-unittests SRCS ldexpl_test.cpp HDRS @@ -844,7 +844,7 @@ add_fp_unittest( add_fp_unittest( logb_test SUITE - libc_math_unittests + libc-math-unittests SRCS logb_test.cpp DEPENDS @@ -856,7 +856,7 @@ add_fp_unittest( add_fp_unittest( logbf_test SUITE - libc_math_unittests + libc-math-unittests SRCS logbf_test.cpp DEPENDS @@ -868,7 +868,7 @@ add_fp_unittest( add_fp_unittest( logbl_test SUITE - libc_math_unittests + libc-math-unittests SRCS logbl_test.cpp HDRS @@ -882,7 +882,7 @@ add_fp_unittest( add_fp_unittest( modf_test SUITE - libc_math_unittests + libc-math-unittests SRCS modf_test.cpp HDRS @@ -899,7 +899,7 @@ add_fp_unittest( add_fp_unittest( modff_test SUITE - libc_math_unittests + libc-math-unittests SRCS modff_test.cpp HDRS @@ -916,7 +916,7 @@ add_fp_unittest( add_fp_unittest( modfl_test SUITE - libc_math_unittests + libc-math-unittests SRCS modfl_test.cpp HDRS @@ -931,7 +931,7 @@ add_fp_unittest( add_fp_unittest( fdimf_test SUITE - libc_math_unittests + libc-math-unittests SRCS fdimf_test.cpp HDRS @@ -946,7 +946,7 @@ add_fp_unittest( add_fp_unittest( fdim_test SUITE - libc_math_unittests + libc-math-unittests SRCS fdim_test.cpp HDRS @@ -961,7 +961,7 @@ add_fp_unittest( add_fp_unittest( fdiml_test SUITE - libc_math_unittests + libc-math-unittests SRCS fdiml_test.cpp HDRS @@ -978,7 +978,7 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) add_fp_unittest( fminf_test SUITE - libc_math_unittests + libc-math-unittests SRCS fminf_test.cpp HDRS @@ -992,7 +992,7 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) add_fp_unittest( fmin_test SUITE - libc_math_unittests 
+ libc-math-unittests SRCS fmin_test.cpp HDRS @@ -1006,7 +1006,7 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) add_fp_unittest( fminl_test SUITE - libc_math_unittests + libc-math-unittests SRCS fminl_test.cpp HDRS @@ -1020,7 +1020,7 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) add_fp_unittest( fmaxf_test SUITE - libc_math_unittests + libc-math-unittests SRCS fmaxf_test.cpp HDRS @@ -1034,7 +1034,7 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) add_fp_unittest( fmax_test SUITE - libc_math_unittests + libc-math-unittests SRCS fmax_test.cpp HDRS @@ -1048,7 +1048,7 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) add_fp_unittest( fmaxl_test SUITE - libc_math_unittests + libc-math-unittests SRCS fmaxl_test.cpp HDRS @@ -1064,7 +1064,7 @@ add_fp_unittest( sqrtf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS sqrtf_test.cpp DEPENDS @@ -1077,7 +1077,7 @@ add_fp_unittest( sqrt_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS sqrt_test.cpp DEPENDS @@ -1090,7 +1090,7 @@ add_fp_unittest( sqrtl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS sqrtl_test.cpp DEPENDS @@ -1103,7 +1103,7 @@ add_fp_unittest( generic_sqrtf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS generic_sqrtf_test.cpp DEPENDS @@ -1118,7 +1118,7 @@ add_fp_unittest( generic_sqrt_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS generic_sqrt_test.cpp DEPENDS @@ -1133,7 +1133,7 @@ add_fp_unittest( generic_sqrtl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS generic_sqrtl_test.cpp DEPENDS @@ -1148,7 +1148,7 @@ add_fp_unittest( remquof_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS remquof_test.cpp HDRS @@ -1164,7 +1164,7 @@ add_fp_unittest( remquo_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS remquo_test.cpp HDRS @@ -1180,7 +1180,7 @@ add_fp_unittest( remquol_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS remquol_test.cpp HDRS @@ -1196,7 +1196,7 @@ add_fp_unittest( hypotf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS hypotf_test.cpp DEPENDS @@ -1209,7 +1209,7 @@ add_fp_unittest( hypot_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS hypot_test.cpp DEPENDS @@ -1223,7 +1223,7 @@ if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( nextafter_test SUITE - libc_math_unittests + libc-math-unittests SRCS nextafter_test.cpp HDRS @@ -1238,7 +1238,7 @@ if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( nextafterf_test SUITE - libc_math_unittests + libc-math-unittests SRCS nextafterf_test.cpp HDRS @@ -1254,7 +1254,7 @@ endif() add_fp_unittest( nextafterl_test SUITE - libc_math_unittests + libc-math-unittests SRCS nextafterl_test.cpp HDRS @@ -1272,7 +1272,7 @@ add_fp_unittest( fmaf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS fmaf_test.cpp DEPENDS @@ -1287,7 +1287,7 @@ add_fp_unittest( fma_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS fma_test.cpp DEPENDS @@ -1300,7 +1300,7 @@ add_fp_unittest( tan_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS tan_test.cpp DEPENDS @@ -1312,7 +1312,7 @@ add_fp_unittest( expm1f_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS expm1f_test.cpp DEPENDS @@ -1326,7 +1326,7 @@ add_fp_unittest( log_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS log_test.cpp DEPENDS @@ -1340,7 +1340,7 @@ add_fp_unittest( logf_test NEED_MPFR SUITE - libc_math_unittests + 
libc-math-unittests SRCS logf_test.cpp DEPENDS @@ -1354,7 +1354,7 @@ add_fp_unittest( log2_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS log2_test.cpp DEPENDS @@ -1368,7 +1368,7 @@ add_fp_unittest( log2f_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS log2f_test.cpp DEPENDS @@ -1382,7 +1382,7 @@ add_fp_unittest( log10_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS log10_test.cpp DEPENDS @@ -1396,7 +1396,7 @@ add_fp_unittest( log10f_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS log10f_test.cpp DEPENDS @@ -1410,7 +1410,7 @@ add_fp_unittest( log1p_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS log1p_test.cpp DEPENDS @@ -1424,7 +1424,7 @@ add_fp_unittest( log1pf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS log1pf_test.cpp DEPENDS @@ -1437,7 +1437,7 @@ add_fp_unittest( add_fp_unittest( fmodf_test SUITE - libc_math_unittests + libc-math-unittests SRCS fmodf_test.cpp HDRS @@ -1455,7 +1455,7 @@ add_fp_unittest( add_fp_unittest( fmod_test SUITE - libc_math_unittests + libc-math-unittests SRCS fmod_test.cpp HDRS @@ -1474,7 +1474,7 @@ add_fp_unittest( explogxf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests HDRS in_float_range_test_helper.h SRCS @@ -1489,7 +1489,7 @@ add_fp_unittest( coshf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS coshf_test.cpp HDRS @@ -1505,7 +1505,7 @@ add_fp_unittest( sinhf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS sinhf_test.cpp HDRS @@ -1521,7 +1521,7 @@ add_fp_unittest( tanhf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS tanhf_test.cpp DEPENDS @@ -1533,7 +1533,7 @@ add_fp_unittest( atanhf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS atanhf_test.cpp DEPENDS @@ -1546,7 +1546,7 @@ add_fp_unittest( asinhf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS asinhf_test.cpp DEPENDS @@ -1559,7 +1559,7 @@ add_fp_unittest( acoshf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS acoshf_test.cpp DEPENDS @@ -1572,7 +1572,7 @@ add_fp_unittest( asinf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS asinf_test.cpp DEPENDS @@ -1585,7 +1585,7 @@ add_fp_unittest( acosf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS acosf_test.cpp DEPENDS @@ -1598,7 +1598,7 @@ add_fp_unittest( atanf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS atanf_test.cpp DEPENDS @@ -1611,7 +1611,7 @@ add_fp_unittest( inv_trigf_utils_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests HDRS in_float_range_test_helper.h SRCS @@ -1625,7 +1625,7 @@ add_fp_unittest( scalbn_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS scalbn_test.cpp HDRS @@ -1641,7 +1641,7 @@ add_fp_unittest( scalbnf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS scalbnf_test.cpp HDRS @@ -1657,7 +1657,7 @@ add_fp_unittest( scalbnl_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS scalbnl_test.cpp HDRS @@ -1673,7 +1673,7 @@ add_fp_unittest( erff_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS erff_test.cpp DEPENDS @@ -1683,6 +1683,7 @@ add_fp_unittest( ) add_subdirectory(generic) +add_subdirectory(smoke) if(NOT LLVM_LIBC_FULL_BUILD) add_subdirectory(exhaustive) diff --git a/libc/test/src/math/generic/CMakeLists.txt b/libc/test/src/math/generic/CMakeLists.txt index 9803f34873e5d..bc2b5cea89d12 100644 
--- a/libc/test/src/math/generic/CMakeLists.txt +++ b/libc/test/src/math/generic/CMakeLists.txt @@ -2,7 +2,7 @@ add_fp_unittest( ceil_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS ../ceil_test.cpp DEPENDS @@ -14,7 +14,7 @@ add_fp_unittest( ceilf_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS ../ceilf_test.cpp DEPENDS @@ -26,7 +26,7 @@ add_fp_unittest( ceill_test NEED_MPFR SUITE - libc_math_unittests + libc-math-unittests SRCS ../ceill_test.cpp DEPENDS diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt new file mode 100644 index 0000000000000..803f6d2b9389b --- /dev/null +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -0,0 +1,1523 @@ +add_custom_target(libc-math-smoke-tests) +add_dependencies(libc-math-unittests libc-math-smoke-tests) + +add_fp_unittest( + cosf_test + SUITE + libc-math-smoke-tests + SRCS + cosf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.cosf + libc.src.__support.CPP.array + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + sinf_test + SUITE + libc-math-smoke-tests + SRCS + sinf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.sinf + libc.src.__support.CPP.array + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + sincosf_test + SUITE + libc-math-smoke-tests + SRCS + sincosf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.sincosf + libc.src.__support.CPP.array + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + tanf_test + SUITE + libc-math-smoke-tests + SRCS + tanf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.tanf + libc.src.__support.CPP.array + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fabs_test + SUITE + libc-math-smoke-tests + SRCS + fabs_test.cpp + HDRS + FAbsTest.h + DEPENDS + libc.include.math + libc.src.math.fabs + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fabsf_test + SUITE + libc-math-smoke-tests + SRCS + fabsf_test.cpp + HDRS + FAbsTest.h + DEPENDS + libc.include.math + libc.src.math.fabsf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fabsl_test + SUITE + libc-math-smoke-tests + SRCS + fabsl_test.cpp + HDRS + FAbsTest.h + DEPENDS + libc.include.math + libc.src.math.fabsl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + trunc_test + SUITE + libc-math-smoke-tests + SRCS + trunc_test.cpp + HDRS + TruncTest.h + DEPENDS + libc.include.math + libc.src.math.trunc + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + truncf_test + SUITE + libc-math-smoke-tests + SRCS + truncf_test.cpp + HDRS + TruncTest.h + DEPENDS + libc.include.math + libc.src.math.truncf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + truncl_test + SUITE + libc-math-smoke-tests + SRCS + truncl_test.cpp + HDRS + TruncTest.h + DEPENDS + libc.include.math + libc.src.math.truncl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + ceil_test + SUITE + libc-math-smoke-tests + SRCS + ceil_test.cpp + HDRS + CeilTest.h + DEPENDS + libc.include.math + libc.src.math.ceil + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + ceilf_test + SUITE + libc-math-smoke-tests + SRCS + ceilf_test.cpp + HDRS + CeilTest.h + DEPENDS + libc.include.math + libc.src.math.ceilf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + ceill_test + SUITE + libc-math-smoke-tests + SRCS + ceill_test.cpp + HDRS + CeilTest.h + DEPENDS + libc.include.math + libc.src.math.ceill + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + floor_test + SUITE + 
libc-math-smoke-tests + SRCS + floor_test.cpp + HDRS + FloorTest.h + DEPENDS + libc.include.math + libc.src.math.floor + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + floorf_test + SUITE + libc-math-smoke-tests + SRCS + floorf_test.cpp + HDRS + FloorTest.h + DEPENDS + libc.include.math + libc.src.math.floorf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + floorl_test + SUITE + libc-math-smoke-tests + SRCS + floorl_test.cpp + HDRS + FloorTest.h + DEPENDS + libc.include.math + libc.src.math.floorl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + round_test + SUITE + libc-math-smoke-tests + SRCS + round_test.cpp + HDRS + RoundTest.h + DEPENDS + libc.include.math + libc.src.math.round + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + roundf_test + SUITE + libc-math-smoke-tests + SRCS + roundf_test.cpp + HDRS + RoundTest.h + DEPENDS + libc.include.math + libc.src.math.roundf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + roundl_test + SUITE + libc-math-smoke-tests + SRCS + roundl_test.cpp + HDRS + RoundTest.h + DEPENDS + libc.include.math + libc.src.math.roundl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + lround_test + SUITE + libc-math-smoke-tests + SRCS + lround_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.errno.errno + libc.src.fenv.feclearexcept + libc.src.fenv.feraiseexcept + libc.src.fenv.fetestexcept + libc.src.math.lround + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + lroundf_test + SUITE + libc-math-smoke-tests + SRCS + lroundf_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.errno.errno + libc.src.fenv.feclearexcept + libc.src.fenv.feraiseexcept + libc.src.fenv.fetestexcept + libc.src.math.lroundf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + lroundl_test + SUITE + libc-math-smoke-tests + SRCS + lroundl_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.errno.errno + libc.src.fenv.feclearexcept + libc.src.fenv.feraiseexcept + libc.src.fenv.fetestexcept + libc.src.math.lroundl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + llround_test + SUITE + libc-math-smoke-tests + SRCS + llround_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.errno.errno + libc.src.fenv.feclearexcept + libc.src.fenv.feraiseexcept + libc.src.fenv.fetestexcept + libc.src.math.llround + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + llroundf_test + SUITE + libc-math-smoke-tests + SRCS + llroundf_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.errno.errno + libc.src.fenv.feclearexcept + libc.src.fenv.feraiseexcept + libc.src.fenv.fetestexcept + libc.src.math.llroundf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + llroundl_test + SUITE + libc-math-smoke-tests + SRCS + llroundl_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.errno.errno + libc.src.fenv.feclearexcept + libc.src.fenv.feraiseexcept + libc.src.fenv.fetestexcept + libc.src.math.llroundl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + rint_test + SUITE + libc-math-smoke-tests + SRCS + rint_test.cpp + HDRS + RIntTest.h + DEPENDS + libc.include.math + libc.src.math.rint + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + rintf_test + SUITE + libc-math-smoke-tests + SRCS + rintf_test.cpp + HDRS + RIntTest.h + DEPENDS + libc.include.math + libc.src.math.rintf + 
libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + rintl_test + SUITE + libc-math-smoke-tests + SRCS + rintl_test.cpp + HDRS + RIntTest.h + DEPENDS + libc.include.math + libc.src.math.rintl + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + lrint_test + SUITE + libc-math-smoke-tests + SRCS + lrint_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.math.lrint + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + lrintf_test + SUITE + libc-math-smoke-tests + SRCS + lrintf_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.math.lrintf + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + lrintl_test + SUITE + libc-math-smoke-tests + SRCS + lrintl_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.math.lrintl + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + llrint_test + SUITE + libc-math-smoke-tests + SRCS + llrint_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.math.llrint + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + llrintf_test + SUITE + libc-math-smoke-tests + SRCS + llrintf_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.math.llrintf + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + llrintl_test + SUITE + libc-math-smoke-tests + SRCS + llrintl_test.cpp + HDRS + RoundToIntegerTest.h + DEPENDS + libc.include.math + libc.src.math.llrintl + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + expf_test + SUITE + libc-math-smoke-tests + SRCS + expf_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.expf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + exp_test + SUITE + libc-math-smoke-tests + SRCS + exp_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.exp + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + exp2f_test + SUITE + libc-math-smoke-tests + SRCS + exp2f_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.exp2f + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + exp2_test + SUITE + libc-math-smoke-tests + SRCS + exp2_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.exp2 + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + exp10f_test + SUITE + libc-math-smoke-tests + SRCS + exp10f_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.exp10f + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + exp10_test + SUITE + libc-math-smoke-tests + SRCS + exp10_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.exp10 + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + copysign_test + SUITE + libc-math-smoke-tests + SRCS + copysign_test.cpp + HDRS + CopySignTest.h + DEPENDS + libc.include.math + libc.src.math.copysign + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + copysignf_test + SUITE + libc-math-smoke-tests + SRCS + copysignf_test.cpp + HDRS + CopySignTest.h + DEPENDS + libc.include.math + libc.src.math.copysignf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + copysignl_test + SUITE + libc-math-smoke-tests + SRCS + 
copysignl_test.cpp + HDRS + CopySignTest.h + DEPENDS + libc.include.math + libc.src.math.copysignl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + frexp_test + SUITE + libc-math-smoke-tests + SRCS + frexp_test.cpp + HDRS + FrexpTest.h + DEPENDS + libc.include.math + libc.src.math.frexp + libc.src.__support.FPUtil.basic_operations +) + +add_fp_unittest( + frexpf_test + SUITE + libc-math-smoke-tests + SRCS + frexpf_test.cpp + HDRS + FrexpTest.h + DEPENDS + libc.include.math + libc.src.math.frexpf + libc.src.__support.FPUtil.basic_operations +) + +add_fp_unittest( + frexpl_test + SUITE + libc-math-smoke-tests + SRCS + frexpl_test.cpp + HDRS + FrexpTest.h + DEPENDS + libc.include.math + libc.src.math.frexpl + libc.src.__support.FPUtil.basic_operations +) + +# FIXME: These tests are currently broken for NVPTX. +if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) + add_fp_unittest( + ilogb_test + SUITE + libc-math-smoke-tests + SRCS + ilogb_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.include.math + libc.src.math.ilogb + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions + ) + + add_fp_unittest( + ilogbf_test + SUITE + libc-math-smoke-tests + SRCS + ilogbf_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.include.math + libc.src.math.ilogbf + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions + ) +endif() + +add_fp_unittest( + ilogbl_test + SUITE + libc-math-smoke-tests + SRCS + ilogbl_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.include.math + libc.src.math.ilogbl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + ldexp_test + SUITE + libc-math-smoke-tests + SRCS + ldexp_test.cpp + HDRS + LdExpTest.h + DEPENDS + libc.include.math + libc.src.math.ldexp + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + ldexpf_test + SUITE + libc-math-smoke-tests + SRCS + ldexpf_test.cpp + HDRS + LdExpTest.h + DEPENDS + libc.include.math + libc.src.math.ldexpf + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + ldexpl_test + SUITE + libc-math-smoke-tests + SRCS + ldexpl_test.cpp + HDRS + LdExpTest.h + DEPENDS + libc.include.math + libc.src.math.ldexpl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + logb_test + SUITE + libc-math-smoke-tests + SRCS + logb_test.cpp + DEPENDS + libc.include.math + libc.src.math.logb + libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + logbf_test + SUITE + libc-math-smoke-tests + SRCS + logbf_test.cpp + DEPENDS + libc.include.math + libc.src.math.logbf + libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + logbl_test + SUITE + libc-math-smoke-tests + SRCS + logbl_test.cpp + HDRS + LogbTest.h + DEPENDS + libc.include.math + libc.src.math.logbl + libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + modf_test + SUITE + libc-math-smoke-tests + SRCS + modf_test.cpp + HDRS + ModfTest.h + DEPENDS + libc.include.math + libc.src.math.modf + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.nearest_integer_operations + # Requires C++ limits. 
+ UNIT_TEST_ONLY +) + +add_fp_unittest( + modff_test + SUITE + libc-math-smoke-tests + SRCS + modff_test.cpp + HDRS + ModfTest.h + DEPENDS + libc.include.math + libc.src.math.modff + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.nearest_integer_operations + # Requires C++ limits. + UNIT_TEST_ONLY +) + +add_fp_unittest( + modfl_test + SUITE + libc-math-smoke-tests + SRCS + modfl_test.cpp + HDRS + ModfTest.h + DEPENDS + libc.include.math + libc.src.math.modfl + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.nearest_integer_operations +) + +add_fp_unittest( + fdimf_test + SUITE + libc-math-smoke-tests + SRCS + fdimf_test.cpp + HDRS + FDimTest.h + DEPENDS + libc.include.math + libc.src.math.fdimf + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fdim_test + SUITE + libc-math-smoke-tests + SRCS + fdim_test.cpp + HDRS + FDimTest.h + DEPENDS + libc.include.math + libc.src.math.fdim + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fdiml_test + SUITE + libc-math-smoke-tests + SRCS + fdiml_test.cpp + HDRS + FDimTest.h + DEPENDS + libc.include.math + libc.src.math.fdiml + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) + +# FIXME: These tests are currently broken on the GPU. +if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) + add_fp_unittest( + fminf_test + SUITE + libc-math-smoke-tests + SRCS + fminf_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.include.math + libc.src.math.fminf + libc.src.__support.FPUtil.fp_bits + ) + + add_fp_unittest( + fmin_test + SUITE + libc-math-smoke-tests + SRCS + fmin_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.include.math + libc.src.math.fmin + libc.src.__support.FPUtil.fp_bits + ) + + add_fp_unittest( + fminl_test + SUITE + libc-math-smoke-tests + SRCS + fminl_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.include.math + libc.src.math.fminl + libc.src.__support.FPUtil.fp_bits + ) + + add_fp_unittest( + fmaxf_test + SUITE + libc-math-smoke-tests + SRCS + fmaxf_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.include.math + libc.src.math.fmaxf + libc.src.__support.FPUtil.fp_bits + ) + + add_fp_unittest( + fmax_test + SUITE + libc-math-smoke-tests + SRCS + fmax_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.include.math + libc.src.math.fmax + libc.src.__support.FPUtil.fp_bits + ) + + add_fp_unittest( + fmaxl_test + SUITE + libc-math-smoke-tests + SRCS + fmaxl_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.include.math + libc.src.math.fmaxl + libc.src.__support.FPUtil.fp_bits + ) +endif() + +add_fp_unittest( + sqrtf_test + SUITE + libc-math-smoke-tests + SRCS + sqrtf_test.cpp + DEPENDS + libc.include.math + libc.src.math.sqrtf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + sqrt_test + SUITE + libc-math-smoke-tests + SRCS + sqrt_test.cpp + DEPENDS + libc.include.math + libc.src.math.sqrt + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + sqrtl_test + SUITE + libc-math-smoke-tests + SRCS + sqrtl_test.cpp + DEPENDS + libc.include.math + libc.src.math.sqrtl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + generic_sqrtf_test + SUITE + libc-math-smoke-tests + SRCS + generic_sqrtf_test.cpp + DEPENDS + libc.src.math.sqrtf + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.generic.sqrt + COMPILE_OPTIONS + -O3 +) + +add_fp_unittest( + generic_sqrt_test + SUITE + libc-math-smoke-tests + SRCS + generic_sqrt_test.cpp + DEPENDS + libc.src.math.sqrt + 
libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.generic.sqrt + COMPILE_OPTIONS + -O3 +) + +add_fp_unittest( + generic_sqrtl_test + SUITE + libc-math-smoke-tests + SRCS + generic_sqrtl_test.cpp + DEPENDS + libc.src.math.sqrtl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.generic.sqrt + COMPILE_OPTIONS + -O3 +) + +add_fp_unittest( + remquof_test + SUITE + libc-math-smoke-tests + SRCS + remquof_test.cpp + HDRS + RemQuoTest.h + DEPENDS + libc.include.math + libc.src.math.remquof + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + remquo_test + SUITE + libc-math-smoke-tests + SRCS + remquo_test.cpp + HDRS + RemQuoTest.h + DEPENDS + libc.include.math + libc.src.math.remquo + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + remquol_test + SUITE + libc-math-smoke-tests + SRCS + remquol_test.cpp + HDRS + RemQuoTest.h + DEPENDS + libc.include.math + libc.src.math.remquol + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + hypotf_test + SUITE + libc-math-smoke-tests + SRCS + hypotf_test.cpp + DEPENDS + libc.include.math + libc.src.math.hypotf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + hypot_test + SUITE + libc-math-smoke-tests + SRCS + hypot_test.cpp + DEPENDS + libc.include.math + libc.src.math.hypot + libc.src.__support.FPUtil.fp_bits +) + +# FIXME: These tests are currently spurious for NVPTX. +if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) + add_fp_unittest( + nextafter_test + SUITE + libc-math-smoke-tests + SRCS + nextafter_test.cpp + HDRS + NextAfterTest.h + DEPENDS + libc.include.math + libc.src.math.nextafter + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits + ) + + add_fp_unittest( + nextafterf_test + SUITE + libc-math-smoke-tests + SRCS + nextafterf_test.cpp + HDRS + NextAfterTest.h + DEPENDS + libc.include.math + libc.src.math.nextafterf + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits + ) +endif() + +add_fp_unittest( + nextafterl_test + SUITE + libc-math-smoke-tests + SRCS + nextafterl_test.cpp + HDRS + NextAfterTest.h + DEPENDS + libc.include.math + libc.src.math.nextafterl + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) + +# TODO(lntue): The current implementation of fputil::general::fma is only +# correctly rounded for the default rounding mode round-to-nearest tie-to-even. 
+add_fp_unittest( + fmaf_test + SUITE + libc-math-smoke-tests + SRCS + fmaf_test.cpp + DEPENDS + libc.include.math + libc.src.math.fmaf + libc.src.__support.FPUtil.fp_bits + FLAGS + FMA_OPT__ONLY +) + +add_fp_unittest( + fma_test + SUITE + libc-math-smoke-tests + SRCS + fma_test.cpp + DEPENDS + libc.include.math + libc.src.math.fma + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + expm1f_test + SUITE + libc-math-smoke-tests + SRCS + expm1f_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.expm1f + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + log_test + SUITE + libc-math-smoke-tests + SRCS + log_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.log + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + logf_test + SUITE + libc-math-smoke-tests + SRCS + logf_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.logf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + log2_test + SUITE + libc-math-smoke-tests + SRCS + log2_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.log2 + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + log2f_test + SUITE + libc-math-smoke-tests + SRCS + log2f_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.log2f + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + log10_test + SUITE + libc-math-smoke-tests + SRCS + log10_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.log10 + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + log10f_test + SUITE + libc-math-smoke-tests + SRCS + log10f_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.log10f + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + log1p_test + SUITE + libc-math-smoke-tests + SRCS + log1p_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.log1p + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + log1pf_test + SUITE + libc-math-smoke-tests + SRCS + log1pf_test.cpp + DEPENDS + libc.src.errno.errno + libc.include.math + libc.src.math.log1pf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fmodf_test + SUITE + libc-math-smoke-tests + SRCS + fmodf_test.cpp + HDRS + FModTest.h + DEPENDS + libc.include.math + libc.src.errno.errno + libc.src.math.fmodf + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.nearest_integer_operations + # Requires C++ limits. + UNIT_TEST_ONLY +) + +add_fp_unittest( + fmod_test + SUITE + libc-math-smoke-tests + SRCS + fmod_test.cpp + HDRS + FModTest.h + DEPENDS + libc.include.math + libc.src.errno.errno + libc.src.math.fmod + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.nearest_integer_operations + # Requires C++ limits. 
+ UNIT_TEST_ONLY +) + +add_fp_unittest( + coshf_test + SUITE + libc-math-smoke-tests + SRCS + coshf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.coshf + libc.src.__support.CPP.array + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + sinhf_test + SUITE + libc-math-smoke-tests + SRCS + sinhf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.sinhf + libc.src.__support.CPP.array + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + tanhf_test + SUITE + libc-math-smoke-tests + SRCS + tanhf_test.cpp + DEPENDS + libc.src.math.tanhf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + atanhf_test + SUITE + libc-math-smoke-tests + SRCS + atanhf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.atanhf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + asinhf_test + SUITE + libc-math-smoke-tests + SRCS + asinhf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.asinhf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + acoshf_test + SUITE + libc-math-smoke-tests + SRCS + acoshf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.acoshf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + asinf_test + SUITE + libc-math-smoke-tests + SRCS + asinf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.asinf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + acosf_test + SUITE + libc-math-smoke-tests + SRCS + acosf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.acosf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + atanf_test + SUITE + libc-math-smoke-tests + SRCS + atanf_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.atanf + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + scalbn_test + SUITE + libc-math-smoke-tests + SRCS + scalbn_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.include.math + libc.src.math.scalbn + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + scalbnf_test + SUITE + libc-math-smoke-tests + SRCS + scalbnf_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.include.math + libc.src.math.scalbnf + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + scalbnl_test + SUITE + libc-math-smoke-tests + SRCS + scalbnl_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.include.math + libc.src.math.scalbnl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + erff_test + SUITE + libc-math-smoke-tests + SRCS + erff_test.cpp + DEPENDS + libc.include.math + libc.src.math.erff + libc.src.__support.FPUtil.fp_bits +) diff --git a/libc/test/src/math/smoke/CeilTest.h b/libc/test/src/math/smoke/CeilTest.h new file mode 100644 index 0000000000000..053218386cbcf --- /dev/null +++ b/libc/test/src/math/smoke/CeilTest.h @@ -0,0 +1,68 @@ +//===-- Utility class to test ceil[f|l] -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class CeilTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*CeilFunc)(T);
+
+  void testSpecialNumbers(CeilFunc func) {
+    EXPECT_FP_EQ(zero, func(zero));
+    EXPECT_FP_EQ(neg_zero, func(neg_zero));
+
+    EXPECT_FP_EQ(inf, func(inf));
+    EXPECT_FP_EQ(neg_inf, func(neg_inf));
+
+    EXPECT_FP_EQ(aNaN, func(aNaN));
+  }
+
+  void testRoundedNumbers(CeilFunc func) {
+    EXPECT_FP_EQ(T(1.0), func(T(1.0)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.0)));
+    EXPECT_FP_EQ(T(10.0), func(T(10.0)));
+    EXPECT_FP_EQ(T(-10.0), func(T(-10.0)));
+    EXPECT_FP_EQ(T(1234.0), func(T(1234.0)));
+    EXPECT_FP_EQ(T(-1234.0), func(T(-1234.0)));
+  }
+
+  void testFractions(CeilFunc func) {
+    EXPECT_FP_EQ(T(1.0), func(T(0.5)));
+    EXPECT_FP_EQ(T(-0.0), func(T(-0.5)));
+    EXPECT_FP_EQ(T(1.0), func(T(0.115)));
+    EXPECT_FP_EQ(T(-0.0), func(T(-0.115)));
+    EXPECT_FP_EQ(T(1.0), func(T(0.715)));
+    EXPECT_FP_EQ(T(-0.0), func(T(-0.715)));
+    EXPECT_FP_EQ(T(2.0), func(T(1.3)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.3)));
+    EXPECT_FP_EQ(T(2.0), func(T(1.5)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.5)));
+    EXPECT_FP_EQ(T(2.0), func(T(1.75)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.75)));
+    EXPECT_FP_EQ(T(11.0), func(T(10.32)));
+    EXPECT_FP_EQ(T(-10.0), func(T(-10.32)));
+    EXPECT_FP_EQ(T(11.0), func(T(10.65)));
+    EXPECT_FP_EQ(T(-10.0), func(T(-10.65)));
+    EXPECT_FP_EQ(T(1235.0), func(T(1234.38)));
+    EXPECT_FP_EQ(T(-1234.0), func(T(-1234.38)));
+    EXPECT_FP_EQ(T(1235.0), func(T(1234.96)));
+    EXPECT_FP_EQ(T(-1234.0), func(T(-1234.96)));
+  }
+};
+
+#define LIST_CEIL_TESTS(T, func)                                              \
+  using LlvmLibcCeilTest = CeilTest<T>;                                       \
+  TEST_F(LlvmLibcCeilTest, SpecialNumbers) { testSpecialNumbers(&func); }     \
+  TEST_F(LlvmLibcCeilTest, RoundedNumbers) { testRoundedNumbers(&func); }     \
+  TEST_F(LlvmLibcCeilTest, Fractions) { testFractions(&func); }
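Note (editorial): per the updated add_math_function.md above, each smoke header like CeilTest.h is instantiated by a tiny per-type test file. A minimal sketch of such a file (the contents below are an assumption mirroring the existing unit-test pattern; it builds inside the libc test tree, not standalone):

```
// libc/test/src/math/smoke/ceilf_test.cpp (sketch of assumed contents)
#include "CeilTest.h"

#include "src/math/ceilf.h"

// Expands to one TEST_F per method of CeilTest<float>.
LIST_CEIL_TESTS(float, __llvm_libc::ceilf)
```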
diff --git a/libc/test/src/math/smoke/CopySignTest.h b/libc/test/src/math/smoke/CopySignTest.h
new file mode 100644
index 0000000000000..2d50dd4877b10
--- /dev/null
+++ b/libc/test/src/math/smoke/CopySignTest.h
@@ -0,0 +1,52 @@
+//===-- Utility class to test copysign[f|l] ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class CopySignTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*CopySignFunc)(T, T);
+
+  void testSpecialNumbers(CopySignFunc func) {
+    EXPECT_FP_EQ(aNaN, func(aNaN, -1.0));
+    EXPECT_FP_EQ(aNaN, func(aNaN, 1.0));
+
+    EXPECT_FP_EQ(neg_inf, func(inf, -1.0));
+    EXPECT_FP_EQ(inf, func(neg_inf, 1.0));
+
+    EXPECT_FP_EQ(neg_zero, func(zero, -1.0));
+    EXPECT_FP_EQ(zero, func(neg_zero, 1.0));
+  }
+
+  void testRange(CopySignFunc func) {
+    constexpr UIntType COUNT = 100'000;
+    constexpr UIntType STEP = UIntType(-1) / COUNT;
+    for (UIntType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
+      T x = T(FPBits(v));
+      if (isnan(x) || isinf(x))
+        continue;
+
+      double res1 = func(x, -x);
+      ASSERT_FP_EQ(res1, -x);
+
+      double res2 = func(x, x);
+      ASSERT_FP_EQ(res2, x);
+    }
+  }
+};
+
+#define LIST_COPYSIGN_TESTS(T, func)                                          \
+  using LlvmLibcCopySignTest = CopySignTest<T>;                               \
+  TEST_F(LlvmLibcCopySignTest, SpecialNumbers) { testSpecialNumbers(&func); } \
+  TEST_F(LlvmLibcCopySignTest, Range) { testRange(&func); }
diff --git a/libc/test/src/math/smoke/FAbsTest.h b/libc/test/src/math/smoke/FAbsTest.h
new file mode 100644
index 0000000000000..df79703d42b1e
--- /dev/null
+++ b/libc/test/src/math/smoke/FAbsTest.h
@@ -0,0 +1,37 @@
+//===-- Utility class to test fabs[f|l] -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class FAbsTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*FabsFunc)(T);
+
+  void testSpecialNumbers(FabsFunc func) {
+    EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN));
+
+    EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf));
+    EXPECT_FP_EQ_ALL_ROUNDING(inf, func(neg_inf));
+
+    EXPECT_FP_EQ_ALL_ROUNDING(zero, func(zero));
+    EXPECT_FP_EQ_ALL_ROUNDING(zero, func(neg_zero));
+
+    EXPECT_FP_EQ_ALL_ROUNDING(T(1), func(T(1)));
+    EXPECT_FP_EQ_ALL_ROUNDING(T(1), func(T(-1)));
+  }
+};
+
+#define LIST_FABS_TESTS(T, func)                                              \
+  using LlvmLibcFAbsTest = FAbsTest<T>;                                       \
+  TEST_F(LlvmLibcFAbsTest, SpecialNumbers) { testSpecialNumbers(&func); }
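Note (editorial): a self-contained illustration of the property CopySignTest::testRange verifies above — copysign keeps the first operand's magnitude and takes the second operand's sign bit. This reference helper is for exposition only, not part of the patch:

```
#include <cassert>
#include <cstdint>
#include <cstring>

// Bit-level reference for double: splice y's sign bit onto x's magnitude.
double copysign_ref(double x, double y) {
  uint64_t xb, yb;
  std::memcpy(&xb, &x, sizeof(x));
  std::memcpy(&yb, &y, sizeof(y));
  xb = (xb & ~(1ULL << 63)) | (yb & (1ULL << 63));
  std::memcpy(&x, &xb, sizeof(x));
  return x;
}

int main() {
  assert(copysign_ref(3.5, -1.0) == -3.5);
  assert(copysign_ref(-2.0, 1.0) == 2.0);
  return 0;
}
```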
diff --git a/libc/test/src/math/smoke/FDimTest.h b/libc/test/src/math/smoke/FDimTest.h
new file mode 100644
index 0000000000000..c0dbda10b318b
--- /dev/null
+++ b/libc/test/src/math/smoke/FDimTest.h
@@ -0,0 +1,82 @@
+//===-- Utility class to test different flavors of fdim ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+template <typename T>
+class FDimTestTemplate : public __llvm_libc::testing::Test {
+public:
+  using FuncPtr = T (*)(T, T);
+  using FPBits = __llvm_libc::fputil::FPBits<T>;
+  using UIntType = typename FPBits::UIntType;
+
+  void test_na_n_arg(FuncPtr func) {
+    EXPECT_FP_EQ(nan, func(nan, inf));
+    EXPECT_FP_EQ(nan, func(neg_inf, nan));
+    EXPECT_FP_EQ(nan, func(nan, zero));
+    EXPECT_FP_EQ(nan, func(neg_zero, nan));
+    EXPECT_FP_EQ(nan, func(nan, T(-1.2345)));
+    EXPECT_FP_EQ(nan, func(T(1.2345), nan));
+    EXPECT_FP_EQ(func(nan, nan), nan);
+  }
+
+  void test_inf_arg(FuncPtr func) {
+    EXPECT_FP_EQ(zero, func(neg_inf, inf));
+    EXPECT_FP_EQ(inf, func(inf, zero));
+    EXPECT_FP_EQ(zero, func(neg_zero, inf));
+    EXPECT_FP_EQ(inf, func(inf, T(1.2345)));
+    EXPECT_FP_EQ(zero, func(T(-1.2345), inf));
+  }
+
+  void test_neg_inf_arg(FuncPtr func) {
+    EXPECT_FP_EQ(inf, func(inf, neg_inf));
+    EXPECT_FP_EQ(zero, func(neg_inf, zero));
+    EXPECT_FP_EQ(inf, func(neg_zero, neg_inf));
+    EXPECT_FP_EQ(zero, func(neg_inf, T(-1.2345)));
+    EXPECT_FP_EQ(inf, func(T(1.2345), neg_inf));
+  }
+
+  void test_both_zero(FuncPtr func) {
+    EXPECT_FP_EQ(zero, func(zero, zero));
+    EXPECT_FP_EQ(zero, func(zero, neg_zero));
+    EXPECT_FP_EQ(zero, func(neg_zero, zero));
+    EXPECT_FP_EQ(zero, func(neg_zero, neg_zero));
+  }
+
+  void test_in_range(FuncPtr func) {
+    constexpr UIntType COUNT = 100'001;
+    constexpr UIntType STEP = UIntType(-1) / COUNT;
+    for (UIntType i = 0, v = 0, w = UIntType(-1); i <= COUNT;
+         ++i, v += STEP, w -= STEP) {
+      T x = T(FPBits(v)), y = T(FPBits(w));
+      if (isnan(x) || isinf(x))
+        continue;
+      if (isnan(y) || isinf(y))
+        continue;
+
+      if (x > y) {
+        EXPECT_FP_EQ(x - y, func(x, y));
+      } else {
+        EXPECT_FP_EQ(zero, func(x, y));
+      }
+    }
+  }
+
+private:
+  // constexpr does not work on FPBits yet, so we cannot have these constants
+  // as static.
+  const T nan = T(__llvm_libc::fputil::FPBits<T>::build_quiet_nan(1));
+  const T inf = T(__llvm_libc::fputil::FPBits<T>::inf());
+  const T neg_inf = T(__llvm_libc::fputil::FPBits<T>::neg_inf());
+  const T zero = T(__llvm_libc::fputil::FPBits<T>::zero());
+  const T neg_zero = T(__llvm_libc::fputil::FPBits<T>::neg_zero());
+};
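Note (editorial): the FDimTestTemplate checks above encode the C standard's fdim semantics: the positive difference max(x - y, +0), with NaN inputs propagated. A self-contained reference sketch (illustrative helper, not part of the patch):

```
#include <cassert>
#include <cmath>
#include <limits>

// Reference semantics exercised by FDimTestTemplate::test_in_range.
template <typename T> T fdim_ref(T x, T y) {
  if (std::isnan(x) || std::isnan(y))
    return std::numeric_limits<T>::quiet_NaN();
  return (x > y) ? (x - y) : T(0.0);
}

int main() {
  assert(fdim_ref(3.0, 1.0) == 2.0);
  assert(fdim_ref(1.0, 3.0) == 0.0); // clamped to +0, never negative
  assert(std::isnan(fdim_ref(std::nan(""), 1.0)));
  return 0;
}
```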
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class FMaxTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*FMaxFunc)(T, T);
+
+  void testNaN(FMaxFunc func) {
+    EXPECT_FP_EQ(inf, func(aNaN, inf));
+    EXPECT_FP_EQ(neg_inf, func(neg_inf, aNaN));
+    EXPECT_FP_EQ(0.0, func(aNaN, 0.0));
+    EXPECT_FP_EQ(-0.0, func(-0.0, aNaN));
+    EXPECT_FP_EQ(T(-1.2345), func(aNaN, T(-1.2345)));
+    EXPECT_FP_EQ(T(1.2345), func(T(1.2345), aNaN));
+    EXPECT_FP_EQ(aNaN, func(aNaN, aNaN));
+  }
+
+  void testInfArg(FMaxFunc func) {
+    EXPECT_FP_EQ(inf, func(neg_inf, inf));
+    EXPECT_FP_EQ(inf, func(inf, 0.0));
+    EXPECT_FP_EQ(inf, func(-0.0, inf));
+    EXPECT_FP_EQ(inf, func(inf, T(1.2345)));
+    EXPECT_FP_EQ(inf, func(T(-1.2345), inf));
+  }
+
+  void testNegInfArg(FMaxFunc func) {
+    EXPECT_FP_EQ(inf, func(inf, neg_inf));
+    EXPECT_FP_EQ(0.0, func(neg_inf, 0.0));
+    EXPECT_FP_EQ(-0.0, func(-0.0, neg_inf));
+    EXPECT_FP_EQ(T(-1.2345), func(neg_inf, T(-1.2345)));
+    EXPECT_FP_EQ(T(1.2345), func(T(1.2345), neg_inf));
+  }
+
+  void testBothZero(FMaxFunc func) {
+    EXPECT_FP_EQ(0.0, func(0.0, 0.0));
+    EXPECT_FP_EQ(0.0, func(-0.0, 0.0));
+    EXPECT_FP_EQ(0.0, func(0.0, -0.0));
+    EXPECT_FP_EQ(-0.0, func(-0.0, -0.0));
+  }
+
+  void testRange(FMaxFunc func) {
+    constexpr UIntType COUNT = 100'001;
+    constexpr UIntType STEP = UIntType(-1) / COUNT;
+    for (UIntType i = 0, v = 0, w = UIntType(-1); i <= COUNT;
+         ++i, v += STEP, w -= STEP) {
+      T x = T(FPBits(v)), y = T(FPBits(w));
+      if (isnan(x) || isinf(x))
+        continue;
+      if (isnan(y) || isinf(y))
+        continue;
+      if ((x == 0) && (y == 0))
+        continue;
+
+      if (x > y) {
+        EXPECT_FP_EQ(x, func(x, y));
+      } else {
+        EXPECT_FP_EQ(y, func(x, y));
+      }
+    }
+  }
+};
+
+#define LIST_FMAX_TESTS(T, func) \
+  using LlvmLibcFMaxTest = FMaxTest<T>; \
+  TEST_F(LlvmLibcFMaxTest, NaN) { testNaN(&func); } \
+  TEST_F(LlvmLibcFMaxTest, InfArg) { testInfArg(&func); } \
+  TEST_F(LlvmLibcFMaxTest, NegInfArg) { testNegInfArg(&func); } \
+  TEST_F(LlvmLibcFMaxTest, BothZero) { testBothZero(&func); } \
+  TEST_F(LlvmLibcFMaxTest, Range) { testRange(&func); }
diff --git a/libc/test/src/math/smoke/FMinTest.h b/libc/test/src/math/smoke/FMinTest.h
new file mode 100644
index 0000000000000..071ab75928c4b
--- /dev/null
+++ b/libc/test/src/math/smoke/FMinTest.h
@@ -0,0 +1,82 @@
+//===-- Utility class to test fmin[f|l] -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class FMinTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*FMinFunc)(T, T);
+
+  void testNaN(FMinFunc func) {
+    EXPECT_FP_EQ(inf, func(aNaN, inf));
+    EXPECT_FP_EQ(neg_inf, func(neg_inf, aNaN));
+    EXPECT_FP_EQ(0.0, func(aNaN, 0.0));
+    EXPECT_FP_EQ(-0.0, func(-0.0, aNaN));
+    EXPECT_FP_EQ(T(-1.2345), func(aNaN, T(-1.2345)));
+    EXPECT_FP_EQ(T(1.2345), func(T(1.2345), aNaN));
+    EXPECT_FP_EQ(aNaN, func(aNaN, aNaN));
+  }
+
+  void testInfArg(FMinFunc func) {
+    EXPECT_FP_EQ(neg_inf, func(neg_inf, inf));
+    EXPECT_FP_EQ(0.0, func(inf, 0.0));
+    EXPECT_FP_EQ(-0.0, func(-0.0, inf));
+    EXPECT_FP_EQ(T(1.2345), func(inf, T(1.2345)));
+    EXPECT_FP_EQ(T(-1.2345), func(T(-1.2345), inf));
+  }
+
+  void testNegInfArg(FMinFunc func) {
+    EXPECT_FP_EQ(neg_inf, func(inf, neg_inf));
+    EXPECT_FP_EQ(neg_inf, func(neg_inf, 0.0));
+    EXPECT_FP_EQ(neg_inf, func(-0.0, neg_inf));
+    EXPECT_FP_EQ(neg_inf, func(neg_inf, T(-1.2345)));
+    EXPECT_FP_EQ(neg_inf, func(T(1.2345), neg_inf));
+  }
+
+  void testBothZero(FMinFunc func) {
+    EXPECT_FP_EQ(0.0, func(0.0, 0.0));
+    EXPECT_FP_EQ(-0.0, func(-0.0, 0.0));
+    EXPECT_FP_EQ(-0.0, func(0.0, -0.0));
+    EXPECT_FP_EQ(-0.0, func(-0.0, -0.0));
+  }
+
+  void testRange(FMinFunc func) {
+    constexpr UIntType COUNT = 100'001;
+    constexpr UIntType STEP = UIntType(-1) / COUNT;
+    for (UIntType i = 0, v = 0, w = UIntType(-1); i <= COUNT;
+         ++i, v += STEP, w -= STEP) {
+      T x = T(FPBits(v)), y = T(FPBits(w));
+      if (isnan(x) || isinf(x))
+        continue;
+      if (isnan(y) || isinf(y))
+        continue;
+      if ((x == 0) && (y == 0))
+        continue;
+
+      if (x > y) {
+        EXPECT_FP_EQ(y, func(x, y));
+      } else {
+        EXPECT_FP_EQ(x, func(x, y));
+      }
+    }
+  }
+};
+
+#define LIST_FMIN_TESTS(T, func) \
+  using LlvmLibcFMinTest = FMinTest<T>; \
+  TEST_F(LlvmLibcFMinTest, NaN) { testNaN(&func); } \
+  TEST_F(LlvmLibcFMinTest, InfArg) { testInfArg(&func); } \
+  TEST_F(LlvmLibcFMinTest, NegInfArg) { testNegInfArg(&func); } \
+  TEST_F(LlvmLibcFMinTest, BothZero) { testBothZero(&func); } \
+  TEST_F(LlvmLibcFMinTest, Range) { testRange(&func); }
diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h
new file mode 100644
index 0000000000000..6bff888294d2b
--- /dev/null
+++ b/libc/test/src/math/smoke/FModTest.h
@@ -0,0 +1,270 @@
+//===-- Utility class to test fmod special numbers ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
+
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/NearestIntegerOperations.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <limits>
+#include <math.h>
+
+#define TEST_SPECIAL(x, y, expected, dom_err, expected_exception) \
+  EXPECT_FP_EQ(expected, f(x, y)); \
+  EXPECT_MATH_ERRNO((dom_err) ? EDOM : 0); \
+  EXPECT_FP_EXCEPTION(expected_exception); \
+  __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT)
+
+#define TEST_REGULAR(x, y, expected) TEST_SPECIAL(x, y, expected, false, 0)
+
+template <typename T> class FmodTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*FModFunc)(T, T);
+
+  void testSpecialNumbers(FModFunc f) {
+    using nl = std::numeric_limits<T>;
+
+    // fmod (+0, y) == +0 for y != 0.
+    TEST_SPECIAL(0.0, 3.0, 0.0, false, 0);
+    TEST_SPECIAL(0.0, nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, -nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, nl::max(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, -nl::max(), 0.0, false, 0);
+
+    // fmod (-0, y) == -0 for y != 0.
+    TEST_SPECIAL(neg_zero, 3.0, neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, -nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, -nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, nl::max(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, -nl::max(), neg_zero, false, 0);
+
+    // fmod (+inf, y) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(inf, 3.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, -1.1L, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, nl::denorm_min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, nl::min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, nl::max(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, inf, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, neg_inf, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (-inf, y) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(neg_inf, 3.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, -1.1L, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::denorm_min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::max(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, inf, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, neg_inf, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (x, +0) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(3.0, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(-1.1L, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(0.0, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_zero, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::denorm_min(), 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::min(), 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::max(), 0.0, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (x, -0) == nl::quiet_NaN() plus invalid exception.
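+    // As with y == +0, these are domain errors: the result is a quiet NaN,
+    // errno is set to EDOM, and FE_INVALID is raised.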
+    TEST_SPECIAL(3.0, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(-1.1L, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(0.0, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_zero, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::denorm_min(), neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::min(), neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::max(), neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (x, +inf) == x for x not infinite.
+    TEST_SPECIAL(0.0, inf, 0.0, false, 0);
+    TEST_SPECIAL(neg_zero, inf, neg_zero, false, 0);
+    TEST_SPECIAL(nl::denorm_min(), inf, nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::min(), inf, nl::min(), false, 0);
+    TEST_SPECIAL(nl::max(), inf, nl::max(), false, 0);
+    TEST_SPECIAL(3.0, inf, 3.0, false, 0);
+    // fmod (x, -inf) == x for x not infinite.
+    TEST_SPECIAL(0.0, neg_inf, 0.0, false, 0);
+    TEST_SPECIAL(neg_zero, neg_inf, neg_zero, false, 0);
+    TEST_SPECIAL(nl::denorm_min(), neg_inf, nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::min(), neg_inf, nl::min(), false, 0);
+    TEST_SPECIAL(nl::max(), neg_inf, nl::max(), false, 0);
+    TEST_SPECIAL(3.0, neg_inf, 3.0, false, 0);
+
+    TEST_SPECIAL(0.0, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(0.0, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_zero, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_zero, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(1.0, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(1.0, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(inf, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(inf, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_inf, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_inf, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(0.0, nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(0.0, -nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(neg_zero, nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(neg_zero, -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(1.0, nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(1.0, -nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(inf, nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(inf, -nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(neg_inf, -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::quiet_NaN(), 0.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), 0.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), neg_zero, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), neg_zero, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), 1.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), 1.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), neg_inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), neg_inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::signaling_NaN(), 0.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), 0.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), neg_zero, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), neg_zero, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), 1.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), 1.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), inf, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), inf, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), neg_inf, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), neg_inf, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::quiet_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::quiet_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::quiet_NaN(), nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::quiet_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+
+    TEST_SPECIAL(6.5, 2.25L, 2.0L, false, 0);
+    TEST_SPECIAL(-6.5, 2.25L, -2.0L, false, 0);
+    TEST_SPECIAL(6.5, -2.25L, 2.0L, false, 0);
+    TEST_SPECIAL(-6.5, -2.25L, -2.0L, false, 0);
+
+    TEST_SPECIAL(nl::max(), nl::max(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), -nl::max(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), -nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(-nl::max(), nl::max(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), -nl::max(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), -nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), -nl::denorm_min(), neg_zero, false, 0);
+
+    TEST_SPECIAL(nl::min(), nl::max(), nl::min(), false, 0);
+    TEST_SPECIAL(nl::min(), -nl::max(), nl::min(), false, 0);
+    TEST_SPECIAL(nl::min(), nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::min(), -nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::min(), nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::min(), -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(-nl::min(), nl::max(), -nl::min(), false, 0);
+    TEST_SPECIAL(-nl::min(), -nl::max(), -nl::min(), false, 0);
+    TEST_SPECIAL(-nl::min(), nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::min(), -nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::min(), nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::min(), -nl::denorm_min(), neg_zero, false, 0);
+
+    TEST_SPECIAL(nl::denorm_min(), nl::max(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), -nl::max(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), nl::min(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), -nl::min(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::denorm_min(), -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), nl::max(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), -nl::max(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), nl::min(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), -nl::min(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), -nl::denorm_min(), neg_zero, false, 0);
+  }
+
+  void testRegularExtreme(FModFunc f) {
+
+    TEST_REGULAR(0x1p127L, 0x3p-149L, 0x1p-149L);
+    TEST_REGULAR(0x1p127L, -0x3p-149L, 0x1p-149L);
+    TEST_REGULAR(0x1p127L, 0x3p-148L, 0x1p-147L);
+    TEST_REGULAR(0x1p127L, -0x3p-148L, 0x1p-147L);
+    TEST_REGULAR(0x1p127L, 0x3p-126L, 0x1p-125L);
+    TEST_REGULAR(0x1p127L, -0x3p-126L, 0x1p-125L);
+    TEST_REGULAR(-0x1p127L, 0x3p-149L, -0x1p-149L);
+    TEST_REGULAR(-0x1p127L, -0x3p-149L, -0x1p-149L);
+    TEST_REGULAR(-0x1p127L, 0x3p-148L, -0x1p-147L);
+    TEST_REGULAR(-0x1p127L, -0x3p-148L, -0x1p-147L);
+    TEST_REGULAR(-0x1p127L, 0x3p-126L, -0x1p-125L);
+    TEST_REGULAR(-0x1p127L, -0x3p-126L, -0x1p-125L);
+
+    if constexpr (sizeof(T) >= sizeof(double)) {
+      TEST_REGULAR(0x1p1023L, 0x3p-1074L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, -0x3p-1074L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, 0x3p-1073L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, -0x3p-1073L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, 0x3p-1022L, 0x1p-1021L);
+      TEST_REGULAR(0x1p1023L, -0x3p-1022L, 0x1p-1021L);
+      TEST_REGULAR(-0x1p1023L, 0x3p-1074L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, -0x3p-1074L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, 0x3p-1073L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, -0x3p-1073L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, 0x3p-1022L, -0x1p-1021L);
+      TEST_REGULAR(-0x1p1023L, -0x3p-1022L, -0x1p-1021L);
+    }
+  }
+};
+
+#define LIST_FMOD_TESTS(T, func) \
+  using LlvmLibcFmodTest = FmodTest<T>; \
+  TEST_F(LlvmLibcFmodTest, SpecialNumbers) { testSpecialNumbers(&func); } \
+  TEST_F(LlvmLibcFmodTest, RegularExtreme) { testRegularExtreme(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
diff --git a/libc/test/src/math/smoke/FloorTest.h b/libc/test/src/math/smoke/FloorTest.h
new file mode 100644
index 0000000000000..b1125b0517e5e
--- /dev/null
+++ b/libc/test/src/math/smoke/FloorTest.h
@@ -0,0 +1,68 @@
+//===-- Utility class to test floor[f|l] ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class FloorTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*FloorFunc)(T);
+
+  void testSpecialNumbers(FloorFunc func) {
+    EXPECT_FP_EQ(zero, func(zero));
+    EXPECT_FP_EQ(neg_zero, func(neg_zero));
+
+    EXPECT_FP_EQ(inf, func(inf));
+    EXPECT_FP_EQ(neg_inf, func(neg_inf));
+
+    EXPECT_FP_EQ(aNaN, func(aNaN));
+  }
+
+  void testRoundedNumbers(FloorFunc func) {
+    EXPECT_FP_EQ(T(1.0), func(T(1.0)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.0)));
+    EXPECT_FP_EQ(T(10.0), func(T(10.0)));
+    EXPECT_FP_EQ(T(-10.0), func(T(-10.0)));
+    EXPECT_FP_EQ(T(1234.0), func(T(1234.0)));
+    EXPECT_FP_EQ(T(-1234.0), func(T(-1234.0)));
+  }
+
+  void testFractions(FloorFunc func) {
+    EXPECT_FP_EQ(T(0.0), func(T(0.5)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-0.5)));
+    EXPECT_FP_EQ(T(0.0), func(T(0.115)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-0.115)));
+    EXPECT_FP_EQ(T(0.0), func(T(0.715)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-0.715)));
+    EXPECT_FP_EQ(T(1.0), func(T(1.3)));
+    EXPECT_FP_EQ(T(-2.0), func(T(-1.3)));
+    EXPECT_FP_EQ(T(1.0), func(T(1.5)));
+    EXPECT_FP_EQ(T(-2.0), func(T(-1.5)));
+    EXPECT_FP_EQ(T(1.0), func(T(1.75)));
+    EXPECT_FP_EQ(T(-2.0), func(T(-1.75)));
+    EXPECT_FP_EQ(T(10.0), func(T(10.32)));
+    EXPECT_FP_EQ(T(-11.0), func(T(-10.32)));
+    EXPECT_FP_EQ(T(10.0), func(T(10.65)));
+    EXPECT_FP_EQ(T(-11.0), func(T(-10.65)));
+    EXPECT_FP_EQ(T(1234.0), func(T(1234.38)));
+    EXPECT_FP_EQ(T(-1235.0), func(T(-1234.38)));
+    EXPECT_FP_EQ(T(1234.0), func(T(1234.96)));
+    EXPECT_FP_EQ(T(-1235.0), func(T(-1234.96)));
+  }
+};
+
+#define LIST_FLOOR_TESTS(T, func) \
+  using LlvmLibcFloorTest = FloorTest<T>; \
+  TEST_F(LlvmLibcFloorTest, SpecialNumbers) { testSpecialNumbers(&func); } \
+  TEST_F(LlvmLibcFloorTest, RoundedNumbers) { testRoundedNumbers(&func); } \
+  TEST_F(LlvmLibcFloorTest, Fractions) { testFractions(&func); }
diff --git a/libc/test/src/math/smoke/FmaTest.h b/libc/test/src/math/smoke/FmaTest.h
new file mode 100644
index 0000000000000..d276dba4291a6
--- /dev/null
+++ b/libc/test/src/math/smoke/FmaTest.h
@@ -0,0 +1,57 @@
+//===-- Utility class to test different flavors of fma --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_FMATEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_FMATEST_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+template <typename T>
+class FmaTestTemplate : public __llvm_libc::testing::Test {
+private:
+  using Func = T (*)(T, T, T);
+  using FPBits = __llvm_libc::fputil::FPBits<T>;
+  using UIntType = typename FPBits::UIntType;
+  const T nan = T(__llvm_libc::fputil::FPBits<T>::build_quiet_nan(1));
+  const T inf = T(__llvm_libc::fputil::FPBits<T>::inf());
+  const T neg_inf = T(__llvm_libc::fputil::FPBits<T>::neg_inf());
+  const T zero = T(__llvm_libc::fputil::FPBits<T>::zero());
+  const T neg_zero = T(__llvm_libc::fputil::FPBits<T>::neg_zero());
+
+public:
+  void test_special_numbers(Func func) {
+    EXPECT_FP_EQ(func(zero, zero, zero), zero);
+    EXPECT_FP_EQ(func(zero, neg_zero, neg_zero), neg_zero);
+    EXPECT_FP_EQ(func(inf, inf, zero), inf);
+    EXPECT_FP_EQ(func(neg_inf, inf, neg_inf), neg_inf);
+    EXPECT_FP_EQ(func(inf, zero, zero), nan);
+    EXPECT_FP_EQ(func(inf, neg_inf, inf), nan);
+    EXPECT_FP_EQ(func(nan, zero, inf), nan);
+    EXPECT_FP_EQ(func(inf, neg_inf, nan), nan);
+
+    // Test underflow rounding up.
+    EXPECT_FP_EQ(func(T(0.5), T(FPBits(FPBits::MIN_SUBNORMAL)),
+                      T(FPBits(FPBits::MIN_SUBNORMAL))),
+                 T(FPBits(UIntType(2))));
+    // Test underflow rounding down.
+    T v = T(FPBits(FPBits::MIN_NORMAL + UIntType(1)));
+    EXPECT_FP_EQ(func(T(1) / T(FPBits::MIN_NORMAL << 1), v,
+                      T(FPBits(FPBits::MIN_NORMAL))),
+                 v);
+    // Test overflow.
+    T z = T(FPBits(FPBits::MAX_NORMAL));
+    EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z);
+    // Exact cancellation.
+    EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0));
+    EXPECT_FP_EQ(func(T(-3.0), T(5.0), T(15.0)), T(0.0));
+  }
+};
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_FMATEST_H
diff --git a/libc/test/src/math/smoke/FrexpTest.h b/libc/test/src/math/smoke/FrexpTest.h
new file mode 100644
index 0000000000000..7a3a948912bb8
--- /dev/null
+++ b/libc/test/src/math/smoke/FrexpTest.h
@@ -0,0 +1,96 @@
+//===-- Utility class to test frexp[f|l] ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class FrexpTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+  static constexpr UIntType HIDDEN_BIT =
+      UIntType(1) << __llvm_libc::fputil::MantissaWidth<T>::VALUE;
+
+public:
+  typedef T (*FrexpFunc)(T, int *);
+
+  void testSpecialNumbers(FrexpFunc func) {
+    int exponent;
+    ASSERT_FP_EQ(aNaN, func(aNaN, &exponent));
+    ASSERT_FP_EQ(inf, func(inf, &exponent));
+    ASSERT_FP_EQ(neg_inf, func(neg_inf, &exponent));
+
+    ASSERT_FP_EQ(0.0, func(0.0, &exponent));
+    ASSERT_EQ(exponent, 0);
+
+    ASSERT_FP_EQ(-0.0, func(-0.0, &exponent));
+    ASSERT_EQ(exponent, 0);
+  }
+
+  void testPowersOfTwo(FrexpFunc func) {
+    int exponent;
+
+    EXPECT_FP_EQ(T(0.5), func(T(1.0), &exponent));
+    EXPECT_EQ(exponent, 1);
+    EXPECT_FP_EQ(T(-0.5), func(T(-1.0), &exponent));
+    EXPECT_EQ(exponent, 1);
+
+    EXPECT_FP_EQ(T(0.5), func(T(2.0), &exponent));
+    EXPECT_EQ(exponent, 2);
+    EXPECT_FP_EQ(T(-0.5), func(T(-2.0), &exponent));
+    EXPECT_EQ(exponent, 2);
+
+    EXPECT_FP_EQ(T(0.5), func(T(4.0), &exponent));
+    EXPECT_EQ(exponent, 3);
+    EXPECT_FP_EQ(T(-0.5), func(T(-4.0), &exponent));
+    EXPECT_EQ(exponent, 3);
+
+    EXPECT_FP_EQ(T(0.5), func(T(8.0), &exponent));
+    EXPECT_EQ(exponent, 4);
+    EXPECT_FP_EQ(T(-0.5), func(T(-8.0), &exponent));
+    EXPECT_EQ(exponent, 4);
+
+    EXPECT_FP_EQ(T(0.5), func(T(16.0), &exponent));
+    EXPECT_EQ(exponent, 5);
+    EXPECT_FP_EQ(T(-0.5), func(T(-16.0), &exponent));
+    EXPECT_EQ(exponent, 5);
+
+    EXPECT_FP_EQ(T(0.5), func(T(32.0), &exponent));
+    EXPECT_EQ(exponent, 6);
+    EXPECT_FP_EQ(T(-0.5), func(T(-32.0), &exponent));
+    EXPECT_EQ(exponent, 6);
+  }
+
+  void testSomeIntegers(FrexpFunc func) {
+    int exponent;
+
+    EXPECT_FP_EQ(T(0.75), func(T(24.0), &exponent));
+    EXPECT_EQ(exponent, 5);
+    EXPECT_FP_EQ(T(-0.75), func(T(-24.0), &exponent));
+    EXPECT_EQ(exponent, 5);
+
+    EXPECT_FP_EQ(T(0.625), func(T(40.0), &exponent));
+    EXPECT_EQ(exponent, 6);
+    EXPECT_FP_EQ(T(-0.625), func(T(-40.0), &exponent));
+    EXPECT_EQ(exponent, 6);
+
+    EXPECT_FP_EQ(T(0.78125), func(T(800.0), &exponent));
+    EXPECT_EQ(exponent, 10);
+    EXPECT_FP_EQ(T(-0.78125), func(T(-800.0), &exponent));
+    EXPECT_EQ(exponent, 10);
+  }
+};
+
+#define LIST_FREXP_TESTS(T, func) \
+  using LlvmLibcFrexpTest = FrexpTest<T>; \
+  TEST_F(LlvmLibcFrexpTest, SpecialNumbers) { testSpecialNumbers(&func); } \
+  TEST_F(LlvmLibcFrexpTest, PowersOfTwo) { testPowersOfTwo(&func); } \
+  TEST_F(LlvmLibcFrexpTest, SomeIntegers) { testSomeIntegers(&func); }
diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h
new file mode 100644
index 0000000000000..084dbc4d83d75
--- /dev/null
+++ b/libc/test/src/math/smoke/HypotTest.h
@@ -0,0 +1,60 @@
+//===-- Utility class to test different flavors of hypot ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_HYPOTTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_HYPOTTEST_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T>
+class HypotTestTemplate : public __llvm_libc::testing::Test {
+private:
+  using Func = T (*)(T, T);
+  using FPBits = __llvm_libc::fputil::FPBits<T>;
+  using UIntType = typename FPBits::UIntType;
+  const T nan = T(FPBits::build_quiet_nan(1));
+  const T inf = T(FPBits::inf());
+  const T neg_inf = T(FPBits::neg_inf());
+  const T zero = T(FPBits::zero());
+  const T neg_zero = T(FPBits::neg_zero());
+  const T max_normal = T(FPBits(FPBits::MAX_NORMAL));
+  const T min_normal = T(FPBits(FPBits::MIN_NORMAL));
+  const T max_subnormal = T(FPBits(FPBits::MAX_SUBNORMAL));
+  const T min_subnormal = T(FPBits(FPBits::MIN_SUBNORMAL));
+
+public:
+  void test_special_numbers(Func func) {
+    constexpr int N = 4;
+    // Pythagorean triples.
+    constexpr T PYT[N][3] = {{3, 4, 5}, {5, 12, 13}, {8, 15, 17}, {7, 24, 25}};
+
+    EXPECT_FP_EQ(func(inf, nan), inf);
+    EXPECT_FP_EQ(func(nan, neg_inf), inf);
+    EXPECT_FP_EQ(func(nan, nan), nan);
+    EXPECT_FP_EQ(func(nan, zero), nan);
+    EXPECT_FP_EQ(func(neg_zero, nan), nan);
+
+    for (int i = 0; i < N; ++i) {
+      EXPECT_FP_EQ_ALL_ROUNDING(PYT[i][2], func(PYT[i][0], PYT[i][1]));
+      EXPECT_FP_EQ_ALL_ROUNDING(PYT[i][2], func(-PYT[i][0], PYT[i][1]));
+      EXPECT_FP_EQ_ALL_ROUNDING(PYT[i][2], func(PYT[i][0], -PYT[i][1]));
+      EXPECT_FP_EQ_ALL_ROUNDING(PYT[i][2], func(-PYT[i][0], -PYT[i][1]));
+
+      EXPECT_FP_EQ_ALL_ROUNDING(PYT[i][2], func(PYT[i][1], PYT[i][0]));
+      EXPECT_FP_EQ_ALL_ROUNDING(PYT[i][2], func(-PYT[i][1], PYT[i][0]));
+      EXPECT_FP_EQ_ALL_ROUNDING(PYT[i][2], func(PYT[i][1], -PYT[i][0]));
+      EXPECT_FP_EQ_ALL_ROUNDING(PYT[i][2], func(-PYT[i][1], -PYT[i][0]));
+    }
+  }
+};
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_HYPOTTEST_H
diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h
new file mode 100644
index 0000000000000..6ad5eb065c8aa
--- /dev/null
+++ b/libc/test/src/math/smoke/ILogbTest.h
@@ -0,0 +1,115 @@
+//===-- Utility class to test different flavors of ilogb --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "test/UnitTest/Test.h"
+#include <limits.h>
+
+#include <math.h>
+
+class LlvmLibcILogbTest : public __llvm_libc::testing::Test {
+public:
+  template <typename T> struct ILogbFunc {
+    typedef int (*Func)(T);
+  };
+
+  template <typename T>
+  void test_special_numbers(typename ILogbFunc<T>::Func func) {
+    EXPECT_EQ(FP_ILOGB0, func(T(__llvm_libc::fputil::FPBits<T>::zero())));
+    EXPECT_EQ(FP_ILOGB0, func(T(__llvm_libc::fputil::FPBits<T>::neg_zero())));
+
+    EXPECT_EQ(FP_ILOGBNAN,
+              func(T(__llvm_libc::fputil::FPBits<T>::build_quiet_nan(1))));
+
+    EXPECT_EQ(INT_MAX, func(T(__llvm_libc::fputil::FPBits<T>::inf())));
+    EXPECT_EQ(INT_MAX, func(T(__llvm_libc::fputil::FPBits<T>::neg_inf())));
+  }
+
+  template <typename T>
+  void test_powers_of_two(typename ILogbFunc<T>::Func func) {
+    EXPECT_EQ(0, func(T(1.0)));
+    EXPECT_EQ(0, func(T(-1.0)));
+
+    EXPECT_EQ(1, func(T(2.0)));
+    EXPECT_EQ(1, func(T(-2.0)));
+
+    EXPECT_EQ(2, func(T(4.0)));
+    EXPECT_EQ(2, func(T(-4.0)));
+
+    EXPECT_EQ(3, func(T(8.0)));
+    EXPECT_EQ(3, func(-8.0));
+
+    EXPECT_EQ(4, func(16.0));
+    EXPECT_EQ(4, func(-16.0));
+
+    EXPECT_EQ(5, func(32.0));
+    EXPECT_EQ(5, func(-32.0));
+  }
+
+  template <typename T>
+  void test_some_integers(typename ILogbFunc<T>::Func func) {
+    EXPECT_EQ(1, func(T(3.0)));
+    EXPECT_EQ(1, func(T(-3.0)));
+
+    EXPECT_EQ(2, func(T(7.0)));
+    EXPECT_EQ(2, func(T(-7.0)));
+
+    EXPECT_EQ(3, func(T(10.0)));
+    EXPECT_EQ(3, func(T(-10.0)));
+
+    EXPECT_EQ(4, func(T(31.0)));
+    EXPECT_EQ(4, func(-31.0));
+
+    EXPECT_EQ(5, func(55.0));
+    EXPECT_EQ(5, func(-55.0));
+  }
+
+  template <typename T>
+  void test_subnormal_range(typename ILogbFunc<T>::Func func) {
+    using FPBits = __llvm_libc::fputil::FPBits<T>;
+    using UIntType = typename FPBits::UIntType;
+    constexpr UIntType COUNT = 10'001;
+    constexpr UIntType STEP =
+        (UIntType(FPBits::MAX_SUBNORMAL) - UIntType(FPBits::MIN_SUBNORMAL)) /
+        COUNT;
+    for (UIntType v = FPBits::MIN_SUBNORMAL; v <= FPBits::MAX_SUBNORMAL;
+         v += STEP) {
+      T x = T(FPBits(v));
+      if (isnan(x) || isinf(x) || x == 0.0)
+        continue;
+
+      int exponent;
+      __llvm_libc::fputil::frexp(x, exponent);
+      ASSERT_EQ(exponent, func(x) + 1);
+    }
+  }
+
+  template <typename T>
+  void test_normal_range(typename ILogbFunc<T>::Func func) {
+    using FPBits = __llvm_libc::fputil::FPBits<T>;
+    using UIntType = typename FPBits::UIntType;
+    constexpr UIntType COUNT = 10'001;
+    constexpr UIntType STEP =
+        (UIntType(FPBits::MAX_NORMAL) - UIntType(FPBits::MIN_NORMAL)) / COUNT;
+    for (UIntType v = FPBits::MIN_NORMAL; v <= FPBits::MAX_NORMAL; v += STEP) {
+      T x = T(FPBits(v));
+      if (isnan(x) || isinf(x) || x == 0.0)
+        continue;
+
+      int exponent;
+      __llvm_libc::fputil::frexp(x, exponent);
+      ASSERT_EQ(exponent, func(x) + 1);
+    }
+  }
+};
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
diff --git a/libc/test/src/math/smoke/LdExpTest.h b/libc/test/src/math/smoke/LdExpTest.h
new file mode 100644
index 0000000000000..2e592995ae792
--- /dev/null
+++ b/libc/test/src/math/smoke/LdExpTest.h
@@ -0,0 +1,166 @@
+//===-- Utility class to test different flavors of ldexp --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_LDEXPTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_LDEXPTEST_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/NormalFloat.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+
+template <typename T>
+class LdExpTestTemplate : public __llvm_libc::testing::Test {
+  using FPBits = __llvm_libc::fputil::FPBits<T>;
+  using NormalFloat = __llvm_libc::fputil::NormalFloat<T>;
+  using UIntType = typename FPBits::UIntType;
+  static constexpr UIntType MANTISSA_WIDTH =
+      __llvm_libc::fputil::MantissaWidth<T>::VALUE;
+  // A normalized mantissa to be used with tests.
+  static constexpr UIntType MANTISSA = NormalFloat::ONE + 0x1234;
+
+  const T zero = T(__llvm_libc::fputil::FPBits<T>::zero());
+  const T neg_zero = T(__llvm_libc::fputil::FPBits<T>::neg_zero());
+  const T inf = T(__llvm_libc::fputil::FPBits<T>::inf());
+  const T neg_inf = T(__llvm_libc::fputil::FPBits<T>::neg_inf());
+  const T nan = T(__llvm_libc::fputil::FPBits<T>::build_quiet_nan(1));
+
+public:
+  typedef T (*LdExpFunc)(T, int);
+
+  void testSpecialNumbers(LdExpFunc func) {
+    int exp_array[5] = {-INT_MAX - 1, -10, 0, 10, INT_MAX};
+    for (int exp : exp_array) {
+      ASSERT_FP_EQ(zero, func(zero, exp));
+      ASSERT_FP_EQ(neg_zero, func(neg_zero, exp));
+      ASSERT_FP_EQ(inf, func(inf, exp));
+      ASSERT_FP_EQ(neg_inf, func(neg_inf, exp));
+      ASSERT_FP_EQ(nan, func(nan, exp));
+    }
+  }
+
+  void testPowersOfTwo(LdExpFunc func) {
+    int32_t exp_array[5] = {1, 2, 3, 4, 5};
+    int32_t val_array[6] = {1, 2, 4, 8, 16, 32};
+    for (int32_t exp : exp_array) {
+      for (int32_t val : val_array) {
+        ASSERT_FP_EQ(T(val << exp), func(T(val), exp));
+        ASSERT_FP_EQ(T(-1 * (val << exp)), func(T(-val), exp));
+      }
+    }
+  }
+
+  void testOverflow(LdExpFunc func) {
+    NormalFloat x(FPBits::MAX_EXPONENT - 10, NormalFloat::ONE + 0xF00BA, 0);
+    for (int32_t exp = 10; exp < 100; ++exp) {
+      ASSERT_FP_EQ(inf, func(T(x), exp));
+      ASSERT_FP_EQ(neg_inf, func(-T(x), exp));
+    }
+  }
+
+  void testUnderflowToZeroOnNormal(LdExpFunc func) {
+    // In this test, we pass a normal number to func and expect zero
+    // to be returned due to underflow.
+    int32_t base_exponent = FPBits::EXPONENT_BIAS + int32_t(MANTISSA_WIDTH);
+    int32_t exp_array[] = {base_exponent + 5, base_exponent + 4,
+                           base_exponent + 3, base_exponent + 2,
+                           base_exponent + 1};
+    T x = NormalFloat(0, MANTISSA, 0);
+    for (int32_t exp : exp_array) {
+      ASSERT_FP_EQ(func(x, -exp), x > 0 ? zero : neg_zero);
+    }
+  }
+
+  void testUnderflowToZeroOnSubnormal(LdExpFunc func) {
+    // In this test, we pass a subnormal number to func and expect zero
+    // to be returned due to underflow.
+    int32_t base_exponent = FPBits::EXPONENT_BIAS + int32_t(MANTISSA_WIDTH);
+    int32_t exp_array[] = {base_exponent + 5, base_exponent + 4,
+                           base_exponent + 3, base_exponent + 2,
+                           base_exponent + 1};
+    T x = NormalFloat(-FPBits::EXPONENT_BIAS, MANTISSA, 0);
+    for (int32_t exp : exp_array) {
+      ASSERT_FP_EQ(func(x, -exp), x > 0 ? zero : neg_zero);
+    }
+  }
+
+  void testNormalOperation(LdExpFunc func) {
+    T val_array[] = {
+        // Normal numbers
+        NormalFloat(100, MANTISSA, 0), NormalFloat(-100, MANTISSA, 0),
+        NormalFloat(100, MANTISSA, 1), NormalFloat(-100, MANTISSA, 1),
+        // Subnormal numbers
+        NormalFloat(-FPBits::EXPONENT_BIAS, MANTISSA, 0),
+        NormalFloat(-FPBits::EXPONENT_BIAS, MANTISSA, 1)};
+    for (int32_t exp = 0; exp <= static_cast<int32_t>(MANTISSA_WIDTH); ++exp) {
+      for (T x : val_array) {
+        // We compare the result of ldexp with the result
+        // of the native multiplication/division instruction.
+
+        // We need to use a NormalFloat here (instead of 1 << exp), because
+        // there are 32 bit systems that don't support 128bit long ints but
+        // support long doubles. This test can do 1 << 64, which would fail
+        // in these systems.
+        NormalFloat two_to_exp = NormalFloat(static_cast<T>(1.L));
+        two_to_exp = two_to_exp.mul2(exp);
+
+        ASSERT_FP_EQ(func(x, exp), x * two_to_exp);
+        ASSERT_FP_EQ(func(x, -exp), x / two_to_exp);
+      }
+    }
+
+    // Normal numbers which trigger mantissa overflow.
+    T x = NormalFloat(-FPBits::EXPONENT_BIAS + 1,
+                      UIntType(2) * NormalFloat::ONE - UIntType(1), 0);
+    ASSERT_FP_EQ(func(x, -1), x / 2);
+    ASSERT_FP_EQ(func(-x, -1), -x / 2);
+
+    // Start with a normal number with a high exponent but pass a very low
+    // number for exp. The result should be a subnormal number.
+    x = NormalFloat(FPBits::EXPONENT_BIAS, NormalFloat::ONE, 0);
+    int exp = -FPBits::MAX_EXPONENT - 5;
+    T result = func(x, exp);
+    FPBits result_bits(result);
+    ASSERT_FALSE(result_bits.is_zero());
+    // Verify that the result is indeed subnormal.
+    ASSERT_EQ(result_bits.get_unbiased_exponent(), uint16_t(0));
+    // But if the exp is so small that normalization leads to zero, then
+    // the result should be zero.
+    result = func(x, -FPBits::MAX_EXPONENT - int(MANTISSA_WIDTH) - 5);
+    ASSERT_TRUE(FPBits(result).is_zero());
+
+    // Start with a subnormal number but pass a very high number for exponent.
+    // The result should not be infinity.
+    x = NormalFloat(-FPBits::EXPONENT_BIAS + 1, NormalFloat::ONE >> 10, 0);
+    exp = FPBits::MAX_EXPONENT + 5;
+    ASSERT_FALSE(FPBits(func(x, exp)).is_inf());
+    // But if the exp is large enough to overcome the normalization shift,
+    // then it should result in infinity.
+    exp = FPBits::MAX_EXPONENT + 15;
+    ASSERT_FP_EQ(func(x, exp), inf);
+  }
+};
+
+#define LIST_LDEXP_TESTS(T, func) \
+  using LlvmLibcLdExpTest = LdExpTestTemplate<T>; \
+  TEST_F(LlvmLibcLdExpTest, SpecialNumbers) { testSpecialNumbers(&func); } \
+  TEST_F(LlvmLibcLdExpTest, PowersOfTwo) { testPowersOfTwo(&func); } \
+  TEST_F(LlvmLibcLdExpTest, OverFlow) { testOverflow(&func); } \
+  TEST_F(LlvmLibcLdExpTest, UnderflowToZeroOnNormal) { \
+    testUnderflowToZeroOnNormal(&func); \
+  } \
+  TEST_F(LlvmLibcLdExpTest, UnderflowToZeroOnSubnormal) { \
+    testUnderflowToZeroOnSubnormal(&func); \
+  } \
+  TEST_F(LlvmLibcLdExpTest, NormalOperation) { testNormalOperation(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_LDEXPTEST_H
diff --git a/libc/test/src/math/smoke/LogbTest.h b/libc/test/src/math/smoke/LogbTest.h
new file mode 100644
index 0000000000000..885bd7c8a6371
--- /dev/null
+++ b/libc/test/src/math/smoke/LogbTest.h
@@ -0,0 +1,91 @@
+//===-- Utility class to test logb[f|l] -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class LogbTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+  static constexpr UIntType HIDDEN_BIT =
+      UIntType(1) << __llvm_libc::fputil::MantissaWidth<T>::VALUE;
+
+public:
+  typedef T (*LogbFunc)(T);
+
+  void testSpecialNumbers(LogbFunc func) {
+    ASSERT_FP_EQ(aNaN, func(aNaN));
+    ASSERT_FP_EQ(inf, func(inf));
+    ASSERT_FP_EQ(inf, func(neg_inf));
+    ASSERT_FP_EQ(neg_inf, func(0.0));
+    ASSERT_FP_EQ(neg_inf, func(-0.0));
+  }
+
+  void testPowersOfTwo(LogbFunc func) {
+    EXPECT_FP_EQ(T(0.0), func(T(1.0)));
+    EXPECT_FP_EQ(T(0.0), func(T(-1.0)));
+
+    EXPECT_FP_EQ(T(1.0), func(T(2.0)));
+    EXPECT_FP_EQ(T(1.0), func(T(-2.0)));
+
+    EXPECT_FP_EQ(T(2.0), func(T(4.0)));
+    EXPECT_FP_EQ(T(2.0), func(T(-4.0)));
+
+    EXPECT_FP_EQ(T(3.0), func(T(8.0)));
+    EXPECT_FP_EQ(T(3.0), func(T(-8.0)));
+
+    EXPECT_FP_EQ(T(4.0), func(T(16.0)));
+    EXPECT_FP_EQ(T(4.0), func(T(-16.0)));
+
+    EXPECT_FP_EQ(T(5.0), func(T(32.0)));
+    EXPECT_FP_EQ(T(5.0), func(T(-32.0)));
+  }
+
+  void testSomeIntegers(LogbFunc func) {
+    EXPECT_FP_EQ(T(1.0), func(T(3.0)));
+    EXPECT_FP_EQ(T(1.0), func(T(-3.0)));
+
+    EXPECT_FP_EQ(T(2.0), func(T(7.0)));
+    EXPECT_FP_EQ(T(2.0), func(T(-7.0)));
+
+    EXPECT_FP_EQ(T(3.0), func(T(10.0)));
+    EXPECT_FP_EQ(T(3.0), func(T(-10.0)));
+
+    EXPECT_FP_EQ(T(4.0), func(T(31.0)));
+    EXPECT_FP_EQ(T(4.0), func(T(-31.0)));
+
+    EXPECT_FP_EQ(T(5.0), func(T(55.0)));
+    EXPECT_FP_EQ(T(5.0), func(T(-55.0)));
+  }
+
+  void testRange(LogbFunc func) {
+    using UIntType = typename FPBits::UIntType;
+    constexpr UIntType COUNT = 100'000;
+    constexpr UIntType STEP = UIntType(-1) / COUNT;
+    for (UIntType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
+      T x = static_cast<T>(FPBits(v));
+      if (isnan(x) || isinf(x) || x == 0.0l)
+        continue;
+
+      int exponent;
+      __llvm_libc::fputil::frexp(x, exponent);
+      ASSERT_FP_EQ(T(exponent), func(x) + T(1.0));
+    }
+  }
+};
+
+#define LIST_LOGB_TESTS(T, func) \
+  using LlvmLibcLogbTest = LogbTest<T>; \
+  TEST_F(LlvmLibcLogbTest, SpecialNumbers) { testSpecialNumbers(&func); } \
+  TEST_F(LlvmLibcLogbTest, PowersOfTwo) { testPowersOfTwo(&func); } \
+  TEST_F(LlvmLibcLogbTest, SomeIntegers) { testSomeIntegers(&func); } \
+  TEST_F(LlvmLibcLogbTest, InRange) { testRange(&func); }
diff --git a/libc/test/src/math/smoke/ModfTest.h b/libc/test/src/math/smoke/ModfTest.h
new file mode 100644
index 0000000000000..687d0dd74ab67
--- /dev/null
+++ b/libc/test/src/math/smoke/ModfTest.h
@@ -0,0 +1,105 @@
+//===-- Utility class to test modf[f|l] -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/NearestIntegerOperations.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class ModfTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*ModfFunc)(T, T *);
+
+  void testSpecialNumbers(ModfFunc func) {
+    T integral;
+
+    EXPECT_FP_EQ(zero, func(zero, &integral));
+    EXPECT_FP_EQ(integral, zero);
+    EXPECT_FP_EQ(neg_zero, func(neg_zero, &integral));
+    EXPECT_FP_EQ(integral, neg_zero);
+
+    EXPECT_FP_EQ(zero, func(inf, &integral));
+    EXPECT_FP_EQ(inf, integral);
+    EXPECT_FP_EQ(neg_zero, func(neg_inf, &integral));
+    EXPECT_FP_EQ(neg_inf, integral);
+
+    EXPECT_FP_EQ(aNaN, func(aNaN, &integral));
+  }
+
+  void testIntegers(ModfFunc func) {
+    T integral;
+
+    EXPECT_FP_EQ(T(0.0), func(T(1.0), &integral));
+    EXPECT_FP_EQ(T(1.0), integral);
+
+    EXPECT_FP_EQ(T(-0.0), func(T(-1.0), &integral));
+    EXPECT_FP_EQ(T(-1.0), integral);
+
+    EXPECT_FP_EQ(T(0.0), func(T(10.0), &integral));
+    EXPECT_FP_EQ(T(10.0), integral);
+
+    EXPECT_FP_EQ(T(-0.0), func(T(-10.0), &integral));
+    EXPECT_FP_EQ(T(-10.0), integral);
+
+    EXPECT_FP_EQ(T(0.0), func(T(12345.0), &integral));
+    EXPECT_FP_EQ(T(12345.0), integral);
+
+    EXPECT_FP_EQ(T(-0.0), func(T(-12345.0), &integral));
+    EXPECT_FP_EQ(T(-12345.0), integral);
+  }
+
+  void testFractions(ModfFunc func) {
+    T integral;
+
+    EXPECT_FP_EQ(T(0.5), func(T(1.5), &integral));
+    EXPECT_FP_EQ(integral, T(1.0));
+
+    EXPECT_FP_EQ(T(-0.5), func(T(-1.5), &integral));
+    EXPECT_FP_EQ(integral, T(-1.0));
+
+    EXPECT_FP_EQ(T(0.75), func(T(10.75), &integral));
+    EXPECT_FP_EQ(integral, T(10.0));
+
+    EXPECT_FP_EQ(T(-0.75), func(T(-10.75), &integral));
+    EXPECT_FP_EQ(integral, T(-10.0));
+
+    EXPECT_FP_EQ(T(0.125), func(T(100.125), &integral));
+    EXPECT_FP_EQ(integral, T(100.0));
+
+    EXPECT_FP_EQ(T(-0.125), func(T(-100.125), &integral));
+    EXPECT_FP_EQ(integral, T(-100.0));
+  }
+
+  void testRange(ModfFunc func) {
+    constexpr UIntType COUNT = 100'000;
+    constexpr UIntType STEP = UIntType(-1) / COUNT;
+    for (UIntType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
+      T x = T(FPBits(v));
+      if (isnan(x) || isinf(x) || x == T(0.0))
+        continue;
+
+      T integral;
+      T frac = func(x, &integral);
+      ASSERT_TRUE(__llvm_libc::fputil::abs(frac) < 1.0l);
+      ASSERT_TRUE(__llvm_libc::fputil::trunc(x) == integral);
+      ASSERT_TRUE(integral + frac == x);
+    }
+  }
+};
+
+#define LIST_MODF_TESTS(T, func) \
+  using LlvmLibcModfTest = ModfTest<T>; \
+  TEST_F(LlvmLibcModfTest, SpecialNumbers) { testSpecialNumbers(&func); } \
+  TEST_F(LlvmLibcModfTest, RoundedNumbers) { testIntegers(&func); } \
+  TEST_F(LlvmLibcModfTest, Fractions) { testFractions(&func); } \
+  TEST_F(LlvmLibcModfTest, Range) { testRange(&func); }
diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h
new file mode 100644
index 0000000000000..e6de9419d9904
--- /dev/null
+++ b/libc/test/src/math/smoke/NextAfterTest.h
@@ -0,0 +1,198 @@
+//===-- Utility class to test different flavors of nextafter ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+template <typename T>
+class NextAfterTestTemplate : public __llvm_libc::testing::Test {
+  using FPBits = __llvm_libc::fputil::FPBits<T>;
+  using MantissaWidth = __llvm_libc::fputil::MantissaWidth<T>;
+  using UIntType = typename FPBits::UIntType;
+
+  static constexpr int BIT_WIDTH_OF_TYPE =
+      __llvm_libc::fputil::FloatProperties<T>::BIT_WIDTH;
+
+  const T zero = T(FPBits::zero());
+  const T neg_zero = T(FPBits::neg_zero());
+  const T inf = T(FPBits::inf());
+  const T neg_inf = T(FPBits::neg_inf());
+  const T nan = T(FPBits::build_quiet_nan(1));
+  const UIntType min_subnormal = FPBits::MIN_SUBNORMAL;
+  const UIntType max_subnormal = FPBits::MAX_SUBNORMAL;
+  const UIntType min_normal = FPBits::MIN_NORMAL;
+  const UIntType max_normal = FPBits::MAX_NORMAL;
+
+public:
+  typedef T (*NextAfterFunc)(T, T);
+
+  void testNaN(NextAfterFunc func) {
+    ASSERT_FP_EQ(func(nan, 0), nan);
+    ASSERT_FP_EQ(func(0, nan), nan);
+  }
+
+  void testBoundaries(NextAfterFunc func) {
+    ASSERT_FP_EQ(func(zero, neg_zero), neg_zero);
+    ASSERT_FP_EQ(func(neg_zero, zero), zero);
+
+    // 'from' is zero|neg_zero.
+    T x = zero;
+    T result = func(x, T(1));
+    UIntType expected_bits = 1;
+    T expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    result = func(x, T(-1));
+    expected_bits = (UIntType(1) << (BIT_WIDTH_OF_TYPE - 1)) + 1;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    x = neg_zero;
+    result = func(x, 1);
+    expected_bits = 1;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    result = func(x, -1);
+    expected_bits = (UIntType(1) << (BIT_WIDTH_OF_TYPE - 1)) + 1;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    // 'from' is max subnormal value.
+    x = __llvm_libc::cpp::bit_cast<T>(max_subnormal);
+    result = func(x, 1);
+    expected = __llvm_libc::cpp::bit_cast<T>(min_normal);
+    ASSERT_FP_EQ(result, expected);
+
+    result = func(x, 0);
+    expected_bits = max_subnormal - 1;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    x = -x;
+
+    result = func(x, -1);
+    expected_bits = (UIntType(1) << (BIT_WIDTH_OF_TYPE - 1)) + min_normal;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    result = func(x, 0);
+    expected_bits =
+        (UIntType(1) << (BIT_WIDTH_OF_TYPE - 1)) + max_subnormal - 1;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    // 'from' is min subnormal value.
+    x = __llvm_libc::cpp::bit_cast<T>(min_subnormal);
+    result = func(x, 1);
+    expected_bits = min_subnormal + 1;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+    ASSERT_FP_EQ(func(x, 0), 0);
+
+    x = -x;
+    result = func(x, -1);
+    expected_bits =
+        (UIntType(1) << (BIT_WIDTH_OF_TYPE - 1)) + min_subnormal + 1;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+    ASSERT_FP_EQ(func(x, 0), T(-0.0));
+
+    // 'from' is min normal.
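+    // (Stepping from min_normal toward zero crosses the normal/subnormal
+    // boundary, so the expected result below is max_subnormal.)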
+    x = __llvm_libc::cpp::bit_cast<T>(min_normal);
+    result = func(x, 0);
+    expected_bits = max_subnormal;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    result = func(x, inf);
+    expected_bits = min_normal + 1;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    x = -x;
+    result = func(x, 0);
+    expected_bits = (UIntType(1) << (BIT_WIDTH_OF_TYPE - 1)) + max_subnormal;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    result = func(x, -inf);
+    expected_bits = (UIntType(1) << (BIT_WIDTH_OF_TYPE - 1)) + min_normal + 1;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+
+    // 'from' is max normal and 'to' is infinity.
+    x = __llvm_libc::cpp::bit_cast<T>(max_normal);
+    result = func(x, inf);
+    ASSERT_FP_EQ(result, inf);
+
+    result = func(-x, -inf);
+    ASSERT_FP_EQ(result, -inf);
+
+    // 'from' is infinity.
+    x = inf;
+    result = func(x, 0);
+    expected_bits = max_normal;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+    ASSERT_FP_EQ(func(x, inf), inf);
+
+    x = neg_inf;
+    result = func(x, 0);
+    expected_bits = (UIntType(1) << (BIT_WIDTH_OF_TYPE - 1)) + max_normal;
+    expected = __llvm_libc::cpp::bit_cast<T>(expected_bits);
+    ASSERT_FP_EQ(result, expected);
+    ASSERT_FP_EQ(func(x, neg_inf), neg_inf);
+
+    // 'from' is a power of 2.
+    x = T(32.0);
+    result = func(x, 0);
+    FPBits x_bits = FPBits(x);
+    FPBits result_bits = FPBits(result);
+    ASSERT_EQ(result_bits.get_unbiased_exponent(),
+              uint16_t(x_bits.get_unbiased_exponent() - 1));
+    ASSERT_EQ(result_bits.get_mantissa(),
+              (UIntType(1) << MantissaWidth::VALUE) - 1);
+
+    result = func(x, T(33.0));
+    result_bits = FPBits(result);
+    ASSERT_EQ(result_bits.get_unbiased_exponent(),
+              x_bits.get_unbiased_exponent());
+    ASSERT_EQ(result_bits.get_mantissa(), x_bits.get_mantissa() + UIntType(1));
+
+    x = -x;
+
+    result = func(x, 0);
+    result_bits = FPBits(result);
+    ASSERT_EQ(result_bits.get_unbiased_exponent(),
+              uint16_t(x_bits.get_unbiased_exponent() - 1));
+    ASSERT_EQ(result_bits.get_mantissa(),
+              (UIntType(1) << MantissaWidth::VALUE) - 1);
+
+    result = func(x, T(-33.0));
+    result_bits = FPBits(result);
+    ASSERT_EQ(result_bits.get_unbiased_exponent(),
+              x_bits.get_unbiased_exponent());
+    ASSERT_EQ(result_bits.get_mantissa(), x_bits.get_mantissa() + UIntType(1));
+  }
+};
+
+#define LIST_NEXTAFTER_TESTS(T, func) \
+  using LlvmLibcNextAfterTest = NextAfterTestTemplate<T>; \
+  TEST_F(LlvmLibcNextAfterTest, TestNaN) { testNaN(&func); } \
+  TEST_F(LlvmLibcNextAfterTest, TestBoundaries) { testBoundaries(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
diff --git a/libc/test/src/math/smoke/RIntTest.h b/libc/test/src/math/smoke/RIntTest.h
new file mode 100644
index 0000000000000..7a6d5c7ed78c9
--- /dev/null
+++ b/libc/test/src/math/smoke/RIntTest.h
@@ -0,0 +1,56 @@
+//===-- Utility class to test different flavors of rint ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_RINTTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_RINTTEST_H
+
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <fenv.h>
+#include <math.h>
+#include <stdio.h>
+
+static constexpr int ROUNDING_MODES[4] = {FE_UPWARD, FE_DOWNWARD,
+                                          FE_TOWARDZERO, FE_TONEAREST};
+
+template <typename T>
+class RIntTestTemplate : public __llvm_libc::testing::Test {
+public:
+  typedef T (*RIntFunc)(T);
+
+private:
+  using FPBits = __llvm_libc::fputil::FPBits<T>;
+  using UIntType = typename FPBits::UIntType;
+
+  const T zero = T(FPBits::zero());
+  const T neg_zero = T(FPBits::neg_zero());
+  const T inf = T(FPBits::inf());
+  const T neg_inf = T(FPBits::neg_inf());
+  const T nan = T(FPBits::build_quiet_nan(1));
+
+public:
+  void testSpecialNumbers(RIntFunc func) {
+    for (int mode : ROUNDING_MODES) {
+      __llvm_libc::fputil::set_round(mode);
+      ASSERT_FP_EQ(inf, func(inf));
+      ASSERT_FP_EQ(neg_inf, func(neg_inf));
+      ASSERT_FP_EQ(nan, func(nan));
+      ASSERT_FP_EQ(zero, func(zero));
+      ASSERT_FP_EQ(neg_zero, func(neg_zero));
+    }
+  }
+};
+
+#define LIST_RINT_TESTS(F, func) \
+  using LlvmLibcRIntTest = RIntTestTemplate<F>; \
+  TEST_F(LlvmLibcRIntTest, specialNumbers) { testSpecialNumbers(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_RINTTEST_H
diff --git a/libc/test/src/math/smoke/RemQuoTest.h b/libc/test/src/math/smoke/RemQuoTest.h
new file mode 100644
index 0000000000000..4d49fd7dedc40
--- /dev/null
+++ b/libc/test/src/math/smoke/RemQuoTest.h
@@ -0,0 +1,102 @@
+//===-- Utility class to test different flavors of remquo -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
+
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+template <typename T>
+class RemQuoTestTemplate : public __llvm_libc::testing::Test {
+  using FPBits = __llvm_libc::fputil::FPBits<T>;
+  using UIntType = typename FPBits::UIntType;
+
+  const T zero = T(__llvm_libc::fputil::FPBits<T>::zero());
+  const T neg_zero = T(__llvm_libc::fputil::FPBits<T>::neg_zero());
+  const T inf = T(__llvm_libc::fputil::FPBits<T>::inf());
+  const T neg_inf = T(__llvm_libc::fputil::FPBits<T>::neg_inf());
+  const T nan = T(__llvm_libc::fputil::FPBits<T>::build_quiet_nan(1));
+
+public:
+  typedef T (*RemQuoFunc)(T, T, int *);
+
+  void testSpecialNumbers(RemQuoFunc func) {
+    int quotient;
+    T x, y;
+
+    y = T(1.0);
+    x = inf;
+    EXPECT_FP_EQ(nan, func(x, y, &quotient));
+    x = neg_inf;
+    EXPECT_FP_EQ(nan, func(x, y, &quotient));
+
+    x = T(1.0);
+    y = zero;
+    EXPECT_FP_EQ(nan, func(x, y, &quotient));
+    y = neg_zero;
+    EXPECT_FP_EQ(nan, func(x, y, &quotient));
+
+    y = nan;
+    x = T(1.0);
+    EXPECT_FP_EQ(nan, func(x, y, &quotient));
+
+    y = T(1.0);
+    x = nan;
+    EXPECT_FP_EQ(nan, func(x, y, &quotient));
+
+    x = nan;
+    y = nan;
+    EXPECT_FP_EQ(nan, func(x, y, &quotient));
+
+    x = zero;
+    y = T(1.0);
+    EXPECT_FP_EQ(func(x, y, &quotient), zero);
+
+    x = neg_zero;
+    y = T(1.0);
+    EXPECT_FP_EQ(func(x, y, &quotient), neg_zero);
+
+    x = T(1.125);
+    y = inf;
+    EXPECT_FP_EQ(func(x, y, &quotient), x);
+    EXPECT_EQ(quotient, 0);
+  }
+
+  void testEqualNumeratorAndDenominator(RemQuoFunc func) {
+    T x = T(1.125), y = T(1.125);
+    int q;
+
+    // When the remainder is zero, the standard requires it to
+    // have the same sign as x.
+
+    EXPECT_FP_EQ(func(x, y, &q), zero);
+    EXPECT_EQ(q, 1);
+
+    EXPECT_FP_EQ(func(x, -y, &q), zero);
+    EXPECT_EQ(q, -1);
+
+    EXPECT_FP_EQ(func(-x, y, &q), neg_zero);
+    EXPECT_EQ(q, -1);
+
+    EXPECT_FP_EQ(func(-x, -y, &q), neg_zero);
+    EXPECT_EQ(q, 1);
+  }
+};
+
+#define LIST_REMQUO_TESTS(T, func) \
+  using LlvmLibcRemQuoTest = RemQuoTestTemplate<T>; \
+  TEST_F(LlvmLibcRemQuoTest, SpecialNumbers) { testSpecialNumbers(&func); } \
+  TEST_F(LlvmLibcRemQuoTest, EqualNumeratorAndDenominator) { \
+    testEqualNumeratorAndDenominator(&func); \
+  }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
diff --git a/libc/test/src/math/smoke/RoundTest.h b/libc/test/src/math/smoke/RoundTest.h
new file mode 100644
index 0000000000000..c67f1ce48220a
--- /dev/null
+++ b/libc/test/src/math/smoke/RoundTest.h
@@ -0,0 +1,68 @@
+//===-- Utility class to test round[f|l] ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class RoundTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*RoundFunc)(T);
+
+  void testSpecialNumbers(RoundFunc func) {
+    EXPECT_FP_EQ(zero, func(zero));
+    EXPECT_FP_EQ(neg_zero, func(neg_zero));
+
+    EXPECT_FP_EQ(inf, func(inf));
+    EXPECT_FP_EQ(neg_inf, func(neg_inf));
+
+    EXPECT_FP_EQ(aNaN, func(aNaN));
+  }
+
+  void testRoundedNumbers(RoundFunc func) {
+    EXPECT_FP_EQ(T(1.0), func(T(1.0)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.0)));
+    EXPECT_FP_EQ(T(10.0), func(T(10.0)));
+    EXPECT_FP_EQ(T(-10.0), func(T(-10.0)));
+    EXPECT_FP_EQ(T(1234.0), func(T(1234.0)));
+    EXPECT_FP_EQ(T(-1234.0), func(T(-1234.0)));
+  }
+
+  void testFractions(RoundFunc func) {
+    EXPECT_FP_EQ(T(1.0), func(T(0.5)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-0.5)));
+    EXPECT_FP_EQ(T(0.0), func(T(0.115)));
+    EXPECT_FP_EQ(T(-0.0), func(T(-0.115)));
+    EXPECT_FP_EQ(T(1.0), func(T(0.715)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-0.715)));
+    EXPECT_FP_EQ(T(1.0), func(T(1.3)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.3)));
+    EXPECT_FP_EQ(T(2.0), func(T(1.5)));
+    EXPECT_FP_EQ(T(-2.0), func(T(-1.5)));
+    EXPECT_FP_EQ(T(2.0), func(T(1.75)));
+    EXPECT_FP_EQ(T(-2.0), func(T(-1.75)));
+    EXPECT_FP_EQ(T(10.0), func(T(10.32)));
+    EXPECT_FP_EQ(T(-10.0), func(T(-10.32)));
+    EXPECT_FP_EQ(T(11.0), func(T(10.65)));
+    EXPECT_FP_EQ(T(-11.0), func(T(-10.65)));
+    EXPECT_FP_EQ(T(1234.0), func(T(1234.38)));
+    EXPECT_FP_EQ(T(-1234.0), func(T(-1234.38)));
+    EXPECT_FP_EQ(T(1235.0), func(T(1234.96)));
+    EXPECT_FP_EQ(T(-1235.0), func(T(-1234.96)));
+  }
+};
+
+#define LIST_ROUND_TESTS(T, func)                                             \
+  using LlvmLibcRoundTest = RoundTest<T>;                                     \
+  TEST_F(LlvmLibcRoundTest, SpecialNumbers) { testSpecialNumbers(&func); }    \
+  TEST_F(LlvmLibcRoundTest, RoundedNumbers) { testRoundedNumbers(&func); }    \
+  TEST_F(LlvmLibcRoundTest, Fractions) { testFractions(&func); }
diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h
new file mode 100644
index 0000000000000..c37186bff57b3
--- /dev/null
+++ b/libc/test/src/math/smoke/RoundToIntegerTest.h
@@ -0,0 +1,168 @@
+//===-- Utility class to test different flavors of [l|ll]round --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_ROUNDTOINTEGERTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_ROUNDTOINTEGERTEST_H
+
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <errno.h>
+#include <math.h>
+
+static constexpr int ROUNDING_MODES[4] = {FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO,
+                                          FE_TONEAREST};
+
+template <typename F, typename I, bool TestModes>
+class RoundToIntegerTestTemplate : public __llvm_libc::testing::Test {
+public:
+  typedef I (*RoundToIntegerFunc)(F);
+
+private:
+  using FPBits = __llvm_libc::fputil::FPBits<F>;
+  using UIntType = typename FPBits::UIntType;
+
+  const F zero = F(__llvm_libc::fputil::FPBits<F>::zero());
+  const F neg_zero = F(__llvm_libc::fputil::FPBits<F>::neg_zero());
+  const F inf = F(__llvm_libc::fputil::FPBits<F>::inf());
+  const F neg_inf = F(__llvm_libc::fputil::FPBits<F>::neg_inf());
+  const F nan = F(__llvm_libc::fputil::FPBits<F>::build_quiet_nan(1));
+  static constexpr I INTEGER_MIN = I(1) << (sizeof(I) * 8 - 1);
+  static constexpr I INTEGER_MAX = -(INTEGER_MIN + 1);
+
+  void test_one_input(RoundToIntegerFunc func, F input, I expected,
+                      bool expectError) {
+    libc_errno = 0;
+    __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT);
+
+    ASSERT_EQ(func(input), expected);
+
+    if (expectError) {
+      ASSERT_FP_EXCEPTION(FE_INVALID);
+      ASSERT_MATH_ERRNO(EDOM);
+    } else {
+      ASSERT_FP_EXCEPTION(0);
+      ASSERT_MATH_ERRNO(0);
+    }
+  }
+
+public:
+  void SetUp() override {
+    if (math_errhandling & MATH_ERREXCEPT) {
+      // We will disable all exceptions so that the test will not
+      // crash with SIGFPE. We can still use fetestexcept to check
+      // if the appropriate flag was raised.
+      __llvm_libc::fputil::disable_except(FE_ALL_EXCEPT);
+    }
+  }
+
+  void do_infinity_and_na_n_test(RoundToIntegerFunc func) {
+    test_one_input(func, inf, INTEGER_MAX, true);
+    test_one_input(func, neg_inf, INTEGER_MIN, true);
+    // This is currently never enabled, the
+    // LLVM_LIBC_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR CMake option in
+    // libc/CMakeLists.txt is not forwarded to C++.
+#if LIBC_COPT_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR
+    // Result is not well-defined, we always return INTEGER_MAX
+    test_one_input(func, nan, INTEGER_MAX, true);
+#endif // LIBC_COPT_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR
+  }
+
+  void testInfinityAndNaN(RoundToIntegerFunc func) {
+    if (TestModes) {
+      for (int mode : ROUNDING_MODES) {
+        __llvm_libc::fputil::set_round(mode);
+        do_infinity_and_na_n_test(func);
+      }
+    } else {
+      do_infinity_and_na_n_test(func);
+    }
+  }
+
+  void do_round_numbers_test(RoundToIntegerFunc func) {
+    test_one_input(func, zero, I(0), false);
+    test_one_input(func, neg_zero, I(0), false);
+    test_one_input(func, F(1.0), I(1), false);
+    test_one_input(func, F(-1.0), I(-1), false);
+    test_one_input(func, F(10.0), I(10), false);
+    test_one_input(func, F(-10.0), I(-10), false);
+    test_one_input(func, F(1234.0), I(1234), false);
+    test_one_input(func, F(-1234.0), I(-1234), false);
+  }
+
+  void testRoundNumbers(RoundToIntegerFunc func) {
+    if (TestModes) {
+      for (int mode : ROUNDING_MODES) {
+        __llvm_libc::fputil::set_round(mode);
+        do_round_numbers_test(func);
+      }
+    } else {
+      do_round_numbers_test(func);
+    }
+  }
+
+  void testSubnormalRange(RoundToIntegerFunc func) {
+    constexpr UIntType COUNT = 1'000'001;
+    constexpr UIntType STEP =
+        (UIntType(FPBits::MAX_SUBNORMAL) - UIntType(FPBits::MIN_SUBNORMAL)) /
+        COUNT;
+    for (UIntType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL;
+         i += STEP) {
+      F x = F(FPBits(i));
+      if (x == F(0.0))
+        continue;
+      // All subnormal numbers should round to zero.
+      if (TestModes) {
+        if (x > 0) {
+          __llvm_libc::fputil::set_round(FE_UPWARD);
+          test_one_input(func, x, I(1), false);
+          __llvm_libc::fputil::set_round(FE_DOWNWARD);
+          test_one_input(func, x, I(0), false);
+          __llvm_libc::fputil::set_round(FE_TOWARDZERO);
+          test_one_input(func, x, I(0), false);
+          __llvm_libc::fputil::set_round(FE_TONEAREST);
+          test_one_input(func, x, I(0), false);
+        } else {
+          __llvm_libc::fputil::set_round(FE_UPWARD);
+          test_one_input(func, x, I(0), false);
+          __llvm_libc::fputil::set_round(FE_DOWNWARD);
+          test_one_input(func, x, I(-1), false);
+          __llvm_libc::fputil::set_round(FE_TOWARDZERO);
+          test_one_input(func, x, I(0), false);
+          __llvm_libc::fputil::set_round(FE_TONEAREST);
+          test_one_input(func, x, I(0), false);
+        }
+      } else {
+        test_one_input(func, x, 0L, false);
+      }
+    }
+  }
+};
+
+#define LIST_ROUND_TO_INTEGER_TESTS_HELPER(F, I, func, TestModes)             \
+  using LlvmLibcRoundToIntegerTest =                                          \
+      RoundToIntegerTestTemplate<F, I, TestModes>;                            \
+  TEST_F(LlvmLibcRoundToIntegerTest, InfinityAndNaN) {                        \
+    testInfinityAndNaN(&func);                                                \
+  }                                                                           \
+  TEST_F(LlvmLibcRoundToIntegerTest, RoundNumbers) {                          \
+    testRoundNumbers(&func);                                                  \
+  }                                                                           \
+  TEST_F(LlvmLibcRoundToIntegerTest, SubnormalRange) {                        \
+    testSubnormalRange(&func);                                                \
+  }
+
+#define LIST_ROUND_TO_INTEGER_TESTS(F, I, func)                               \
+  LIST_ROUND_TO_INTEGER_TESTS_HELPER(F, I, func, false)
+
+#define LIST_ROUND_TO_INTEGER_TESTS_WITH_MODES(F, I, func)                    \
+  LIST_ROUND_TO_INTEGER_TESTS_HELPER(F, I, func, true)
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_ROUNDTOINTEGERTEST_H
diff --git a/libc/test/src/math/smoke/ScalbnTest.h b/libc/test/src/math/smoke/ScalbnTest.h
new file mode 100644
index 0000000000000..e1d035c4ea3aa
--- /dev/null
+++ b/libc/test/src/math/smoke/ScalbnTest.h
@@ -0,0 +1,28 @@
+//===-- Utility class to test different flavors of scalbn -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_SCALBN_H
+#define LLVM_LIBC_TEST_SRC_MATH_SCALBN_H
+
+#include "LdExpTest.h"
+#include "test/UnitTest/Test.h"
+
+#define LIST_SCALBN_TESTS(T, func)                                            \
+  using LlvmLibcScalbnTest = LdExpTestTemplate<T>;                            \
+  TEST_F(LlvmLibcScalbnTest, SpecialNumbers) { testSpecialNumbers(&func); }   \
+  TEST_F(LlvmLibcScalbnTest, PowersOfTwo) { testPowersOfTwo(&func); }         \
+  TEST_F(LlvmLibcScalbnTest, OverFlow) { testOverflow(&func); }               \
+  TEST_F(LlvmLibcScalbnTest, UnderflowToZeroOnNormal) {                       \
+    testUnderflowToZeroOnNormal(&func);                                       \
+  }                                                                           \
+  TEST_F(LlvmLibcScalbnTest, UnderflowToZeroOnSubnormal) {                    \
+    testUnderflowToZeroOnSubnormal(&func);                                    \
+  }                                                                           \
+  TEST_F(LlvmLibcScalbnTest, NormalOperation) { testNormalOperation(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_SCALBN_H
diff --git a/libc/test/src/math/smoke/SqrtTest.h b/libc/test/src/math/smoke/SqrtTest.h
new file mode 100644
index 0000000000000..c3896e302373a
--- /dev/null
+++ b/libc/test/src/math/smoke/SqrtTest.h
@@ -0,0 +1,40 @@
+//===-- Utility class to test sqrt[f|l] -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class SqrtTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+  static constexpr UIntType HIDDEN_BIT =
+      UIntType(1) << __llvm_libc::fputil::MantissaWidth<T>::VALUE;
+
+public:
+  typedef T (*SqrtFunc)(T);
+
+  void test_special_numbers(SqrtFunc func) {
+    ASSERT_FP_EQ(aNaN, func(aNaN));
+    ASSERT_FP_EQ(inf, func(inf));
+    ASSERT_FP_EQ(aNaN, func(neg_inf));
+    ASSERT_FP_EQ(0.0, func(0.0));
+    ASSERT_FP_EQ(-0.0, func(-0.0));
+    ASSERT_FP_EQ(aNaN, func(T(-1.0)));
+    ASSERT_FP_EQ(T(1.0), func(T(1.0)));
+    ASSERT_FP_EQ(T(2.0), func(T(4.0)));
+    ASSERT_FP_EQ(T(3.0), func(T(9.0)));
+  }
+};
+
+#define LIST_SQRT_TESTS(T, func)                                              \
+  using LlvmLibcSqrtTest = SqrtTest<T>;                                       \
+  TEST_F(LlvmLibcSqrtTest, SpecialNumbers) { test_special_numbers(&func); }
diff --git a/libc/test/src/math/smoke/TruncTest.h b/libc/test/src/math/smoke/TruncTest.h
new file mode 100644
index 0000000000000..32b5503589ca7
--- /dev/null
+++ b/libc/test/src/math/smoke/TruncTest.h
@@ -0,0 +1,68 @@
+//===-- Utility class to test trunc[f|l] ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include <math.h>
+
+template <typename T> class TruncTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*TruncFunc)(T);
+
+  void testSpecialNumbers(TruncFunc func) {
+    EXPECT_FP_EQ(zero, func(zero));
+    EXPECT_FP_EQ(neg_zero, func(neg_zero));
+
+    EXPECT_FP_EQ(inf, func(inf));
+    EXPECT_FP_EQ(neg_inf, func(neg_inf));
+
+    EXPECT_FP_EQ(aNaN, func(aNaN));
+  }
+
+  void testRoundedNumbers(TruncFunc func) {
+    EXPECT_FP_EQ(T(1.0), func(T(1.0)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.0)));
+    EXPECT_FP_EQ(T(10.0), func(T(10.0)));
+    EXPECT_FP_EQ(T(-10.0), func(T(-10.0)));
+    EXPECT_FP_EQ(T(1234.0), func(T(1234.0)));
+    EXPECT_FP_EQ(T(-1234.0), func(T(-1234.0)));
+  }
+
+  void testFractions(TruncFunc func) {
+    EXPECT_FP_EQ(T(0.0), func(T(0.5)));
+    EXPECT_FP_EQ(T(-0.0), func(T(-0.5)));
+    EXPECT_FP_EQ(T(0.0), func(T(0.115)));
+    EXPECT_FP_EQ(T(-0.0), func(T(-0.115)));
+    EXPECT_FP_EQ(T(0.0), func(T(0.715)));
+    EXPECT_FP_EQ(T(-0.0), func(T(-0.715)));
+    EXPECT_FP_EQ(T(1.0), func(T(1.3)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.3)));
+    EXPECT_FP_EQ(T(1.0), func(T(1.5)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.5)));
+    EXPECT_FP_EQ(T(1.0), func(T(1.75)));
+    EXPECT_FP_EQ(T(-1.0), func(T(-1.75)));
+    EXPECT_FP_EQ(T(10.0), func(T(10.32)));
+    EXPECT_FP_EQ(T(-10.0), func(T(-10.32)));
+    EXPECT_FP_EQ(T(10.0), func(T(10.65)));
+    EXPECT_FP_EQ(T(-10.0), func(T(-10.65)));
+    EXPECT_FP_EQ(T(1234.0), func(T(1234.38)));
+    EXPECT_FP_EQ(T(-1234.0), func(T(-1234.38)));
+    EXPECT_FP_EQ(T(1234.0), func(T(1234.96)));
+    EXPECT_FP_EQ(T(-1234.0), func(T(-1234.96)));
+  }
+};
+
+#define LIST_TRUNC_TESTS(T, func)                                             \
+  using LlvmLibcTruncTest = TruncTest<T>;                                     \
+  TEST_F(LlvmLibcTruncTest, SpecialNumbers) { testSpecialNumbers(&func); }    \
+  TEST_F(LlvmLibcTruncTest, RoundedNumbers) { testRoundedNumbers(&func); }    \
+  TEST_F(LlvmLibcTruncTest, Fractions) { testFractions(&func); }
diff --git a/libc/test/src/math/smoke/acosf_test.cpp b/libc/test/src/math/smoke/acosf_test.cpp
new file mode 100644
index 0000000000000..0fc20970232f7
--- /dev/null
+++ b/libc/test/src/math/smoke/acosf_test.cpp
@@ -0,0 +1,43 @@
+//===-- Unittests for acosf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/acosf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcAcosfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::acosf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::acosf(inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::acosf(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::acosf(1.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::acosf(2.0f));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::acosf(-2.0f));
+  EXPECT_MATH_ERRNO(EDOM);
+}
diff --git a/libc/test/src/math/smoke/acoshf_test.cpp b/libc/test/src/math/smoke/acoshf_test.cpp
new file mode 100644
index 0000000000000..ac3021ff3154e
--- /dev/null
+++ b/libc/test/src/math/smoke/acoshf_test.cpp
@@ -0,0 +1,40 @@
+//===-- Unittests for acoshf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/acoshf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits_t = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcAcoshfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::acoshf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::acoshf(0.0f));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0f, __llvm_libc::acoshf(1.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, __llvm_libc::acoshf(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::acoshf(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+}
diff --git a/libc/test/src/math/smoke/asinf_test.cpp b/libc/test/src/math/smoke/asinf_test.cpp
new file mode 100644
index 0000000000000..f6455475947a3
--- /dev/null
+++ b/libc/test/src/math/smoke/asinf_test.cpp
@@ -0,0 +1,46 @@
+//===-- Unittests for asinf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/asinf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcAsinfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::asinf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0f, __llvm_libc::asinf(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, __llvm_libc::asinf(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::asinf(inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::asinf(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::asinf(2.0f));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::asinf(-2.0f));
+  EXPECT_MATH_ERRNO(EDOM);
+}
diff --git a/libc/test/src/math/smoke/asinhf_test.cpp b/libc/test/src/math/smoke/asinhf_test.cpp
new file mode 100644
index 0000000000000..6b5f9437b4149
--- /dev/null
+++ b/libc/test/src/math/smoke/asinhf_test.cpp
@@ -0,0 +1,40 @@
+//===-- Unittests for asinhf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/asinhf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits_t = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcAsinhfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::asinhf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0f, __llvm_libc::asinhf(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, __llvm_libc::asinhf(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, __llvm_libc::asinhf(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, __llvm_libc::asinhf(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+}
diff --git a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp
new file mode 100644
index 0000000000000..7d3f38682e8c0
--- /dev/null
+++ b/libc/test/src/math/smoke/atanf_test.cpp
@@ -0,0 +1,40 @@
+//===-- Unittests for atanf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/atanf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcAtanfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT);
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::atanf(aNaN));
+  EXPECT_FP_EXCEPTION(0);
+  EXPECT_MATH_ERRNO(0);
+
+  __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT);
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0f, __llvm_libc::atanf(0.0f));
+  EXPECT_FP_EXCEPTION(0);
+  EXPECT_MATH_ERRNO(0);
+
+  __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT);
+  EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, __llvm_libc::atanf(-0.0f));
+  EXPECT_FP_EXCEPTION(0);
+  EXPECT_MATH_ERRNO(0);
+}
diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp
new file mode 100644
index 0000000000000..ac2d47babe4eb
--- /dev/null
+++ b/libc/test/src/math/smoke/atanhf_test.cpp
@@ -0,0 +1,72 @@
+//===-- Unittests for atanhf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/atanhf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcAtanhfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT);
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::atanhf(aNaN));
+  EXPECT_FP_EXCEPTION(0);
+  EXPECT_MATH_ERRNO(0);
+
+  __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT);
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0f, __llvm_libc::atanhf(0.0f));
+  EXPECT_FP_EXCEPTION(0);
+  EXPECT_MATH_ERRNO(0);
+
+  __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT);
+  EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, __llvm_libc::atanhf(-0.0f));
+  EXPECT_FP_EXCEPTION(0);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, __llvm_libc::atanhf(1.0f), FE_DIVBYZERO);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::atanhf(-1.0f),
+                              FE_DIVBYZERO);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  auto bt = FPBits(1.0f);
+  bt.bits += 1;
+
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::atanhf(bt.get_val()),
+                                  FE_INVALID);
+  EXPECT_MATH_ERRNO(EDOM);
+
+  bt.set_sign(true);
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::atanhf(bt.get_val()),
+                                  FE_INVALID);
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::atanhf(2.0f), FE_INVALID);
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::atanhf(-2.0f), FE_INVALID);
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::atanhf(inf), FE_INVALID);
+  EXPECT_MATH_ERRNO(EDOM);
+
+  bt.set_sign(true);
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::atanhf(neg_inf), FE_INVALID);
+  EXPECT_MATH_ERRNO(EDOM);
+}
diff --git a/libc/test/src/math/smoke/ceil_test.cpp b/libc/test/src/math/smoke/ceil_test.cpp
new file mode 100644
index 0000000000000..2a592e3ee85b0
--- /dev/null
+++ b/libc/test/src/math/smoke/ceil_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for ceil ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CeilTest.h"
+
+#include "src/math/ceil.h"
+
+LIST_CEIL_TESTS(double, __llvm_libc::ceil)
diff --git a/libc/test/src/math/smoke/ceilf_test.cpp b/libc/test/src/math/smoke/ceilf_test.cpp
new file mode 100644
index 0000000000000..cfa8cb2c5aa6d
--- /dev/null
+++ b/libc/test/src/math/smoke/ceilf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for ceilf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CeilTest.h"
+
+#include "src/math/ceilf.h"
+
+LIST_CEIL_TESTS(float, __llvm_libc::ceilf)
diff --git a/libc/test/src/math/smoke/ceill_test.cpp b/libc/test/src/math/smoke/ceill_test.cpp
new file mode 100644
index 0000000000000..6fe30465edc85
--- /dev/null
+++ b/libc/test/src/math/smoke/ceill_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for ceill -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CeilTest.h"
+
+#include "src/math/ceill.h"
+
+LIST_CEIL_TESTS(long double, __llvm_libc::ceill)
diff --git a/libc/test/src/math/smoke/copysign_test.cpp b/libc/test/src/math/smoke/copysign_test.cpp
new file mode 100644
index 0000000000000..37bfa0d90c378
--- /dev/null
+++ b/libc/test/src/math/smoke/copysign_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for copysign --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CopySignTest.h"
+
+#include "src/math/copysign.h"
+
+LIST_COPYSIGN_TESTS(double, __llvm_libc::copysign)
diff --git a/libc/test/src/math/smoke/copysignf_test.cpp b/libc/test/src/math/smoke/copysignf_test.cpp
new file mode 100644
index 0000000000000..fec283ccd0c7a
--- /dev/null
+++ b/libc/test/src/math/smoke/copysignf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for copysignf -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CopySignTest.h"
+
+#include "src/math/copysignf.h"
+
+LIST_COPYSIGN_TESTS(float, __llvm_libc::copysignf)
diff --git a/libc/test/src/math/smoke/copysignl_test.cpp b/libc/test/src/math/smoke/copysignl_test.cpp
new file mode 100644
index 0000000000000..d1de911196b62
--- /dev/null
+++ b/libc/test/src/math/smoke/copysignl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for copysignl -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CopySignTest.h"
+
+#include "src/math/copysignl.h"
+
+LIST_COPYSIGN_TESTS(long double, __llvm_libc::copysignl)
diff --git a/libc/test/src/math/smoke/cosf_test.cpp b/libc/test/src/math/smoke/cosf_test.cpp
new file mode 100644
index 0000000000000..4ef96677eb4d1
--- /dev/null
+++ b/libc/test/src/math/smoke/cosf_test.cpp
@@ -0,0 +1,40 @@
+//===-- Unittests for cosf ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/cosf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcCosfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::cosf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::cosf(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::cosf(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::cosf(inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::cosf(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+}
diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp
new file mode 100644
index 0000000000000..7198aa266aa8d
--- /dev/null
+++ b/libc/test/src/math/smoke/coshf_test.cpp
@@ -0,0 +1,56 @@
+//===-- Unittests for coshf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/coshf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcCoshfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::coshf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, __llvm_libc::coshf(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, __llvm_libc::coshf(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::coshf(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::coshf(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST(LlvmLibcCoshfTest, Overflow) {
+  libc_errno = 0;
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::coshf(float(FPBits(0x7f7fffffU))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::coshf(float(FPBits(0x42cffff8U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::coshf(float(FPBits(0x42d00008U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+}
diff --git a/libc/test/src/math/smoke/erff_test.cpp b/libc/test/src/math/smoke/erff_test.cpp
new file mode 100644
index 0000000000000..c2a247b6e6672
--- /dev/null
+++ b/libc/test/src/math/smoke/erff_test.cpp
@@ -0,0 +1,28 @@
+//===-- Unittests for erff ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/erff.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using __llvm_libc::testing::tlog;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcErffTest, SpecialNumbers) {
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::erff(aNaN));
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::erff(inf));
+  EXPECT_FP_EQ_ALL_ROUNDING(-1.0f, __llvm_libc::erff(neg_inf));
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::erff(zero));
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, __llvm_libc::erff(neg_zero));
+}
diff --git a/libc/test/src/math/smoke/exp10_test.cpp b/libc/test/src/math/smoke/exp10_test.cpp
new file mode 100644
index 0000000000000..21ba51d32345a
--- /dev/null
+++ b/libc/test/src/math/smoke/exp10_test.cpp
@@ -0,0 +1,36 @@
+//===-- Unittests for 10^x ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/exp10.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using __llvm_libc::testing::tlog;
+
+DECLARE_SPECIAL_CONSTANTS(double)
+
+TEST(LlvmLibcExp10Test, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::exp10(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::exp10(inf));
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::exp10(neg_inf));
+  EXPECT_FP_EQ_WITH_EXCEPTION(zero, __llvm_libc::exp10(-0x1.0p20),
+                              FE_UNDERFLOW);
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, __llvm_libc::exp10(0x1.0p20), FE_OVERFLOW);
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0, __llvm_libc::exp10(0.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0, __llvm_libc::exp10(-0.0));
+
+  EXPECT_FP_EQ_ALL_ROUNDING(10.0, __llvm_libc::exp10(1.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(100.0, __llvm_libc::exp10(2.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(1000.0, __llvm_libc::exp10(3.0));
+}
diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp
new file mode 100644
index 0000000000000..4de0074ab8269
--- /dev/null
+++ b/libc/test/src/math/smoke/exp10f_test.cpp
@@ -0,0 +1,56 @@
+//===-- Unittests for exp10f ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/exp10f.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <stdint.h>
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcExp10fTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::exp10f(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, __llvm_libc::exp10f(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0f, __llvm_libc::exp10f(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::exp10f(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::exp10f(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(10.0f, __llvm_libc::exp10f(1.0f));
+  EXPECT_FP_EQ_ALL_ROUNDING(100.0f, __llvm_libc::exp10f(2.0f));
+  EXPECT_FP_EQ_ALL_ROUNDING(1000.0f, __llvm_libc::exp10f(3.0f));
+}
+
+TEST(LlvmLibcExp10fTest, Overflow) {
+  libc_errno = 0;
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::exp10f(float(FPBits(0x7f7fffffU))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::exp10f(float(FPBits(0x43000000U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::exp10f(float(FPBits(0x43000001U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+}
diff --git a/libc/test/src/math/smoke/exp2_test.cpp b/libc/test/src/math/smoke/exp2_test.cpp
new file mode 100644
index 0000000000000..13b24b0126109
--- /dev/null
+++ b/libc/test/src/math/smoke/exp2_test.cpp
@@ -0,0 +1,35 @@
+//===-- Unittests for 2^x -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/exp2.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using __llvm_libc::testing::tlog;
+
+DECLARE_SPECIAL_CONSTANTS(double)
+
+TEST(LlvmLibcExp2Test, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::exp2(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::exp2(inf));
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::exp2(neg_inf));
+  EXPECT_FP_EQ_WITH_EXCEPTION(zero, __llvm_libc::exp2(-0x1.0p20), FE_UNDERFLOW);
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, __llvm_libc::exp2(0x1.0p20), FE_OVERFLOW);
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0, __llvm_libc::exp2(0.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0, __llvm_libc::exp2(-0.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(2.0, __llvm_libc::exp2(1.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(0.5, __llvm_libc::exp2(-1.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(4.0, __llvm_libc::exp2(2.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(0.25, __llvm_libc::exp2(-2.0));
+}
diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp
new file mode 100644
index 0000000000000..c22a5eb294100
--- /dev/null
+++ b/libc/test/src/math/smoke/exp2f_test.cpp
@@ -0,0 +1,58 @@
+//===-- Unittests for exp2f -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
+#include "src/errno/libc_errno.h"
+#include "src/math/exp2f.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <stdint.h>
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcExp2fTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::exp2f(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, __llvm_libc::exp2f(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0f, __llvm_libc::exp2f(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::exp2f(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::exp2f(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(2.0f, __llvm_libc::exp2f(1.0f));
+  EXPECT_FP_EQ_ALL_ROUNDING(0.5f, __llvm_libc::exp2f(-1.0f));
+  EXPECT_FP_EQ_ALL_ROUNDING(4.0f, __llvm_libc::exp2f(2.0f));
+  EXPECT_FP_EQ_ALL_ROUNDING(0.25f, __llvm_libc::exp2f(-2.0f));
+}
+
+TEST(LlvmLibcExp2fTest, Overflow) {
+  libc_errno = 0;
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::exp2f(float(FPBits(0x7f7fffffU))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::exp2f(float(FPBits(0x43000000U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::exp2f(float(FPBits(0x43000001U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+}
diff --git a/libc/test/src/math/smoke/exp_test.cpp b/libc/test/src/math/smoke/exp_test.cpp
new file mode 100644
index 0000000000000..3406f0afd73eb
--- /dev/null
+++ b/libc/test/src/math/smoke/exp_test.cpp
@@ -0,0 +1,31 @@
+//===-- Unittests for exp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/exp.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using __llvm_libc::testing::tlog;
+
+DECLARE_SPECIAL_CONSTANTS(double)
+
+TEST(LlvmLibcExpTest, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::exp(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::exp(inf));
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::exp(neg_inf));
+  EXPECT_FP_EQ_WITH_EXCEPTION(zero, __llvm_libc::exp(-0x1.0p20), FE_UNDERFLOW);
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, __llvm_libc::exp(0x1.0p20), FE_OVERFLOW);
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0, __llvm_libc::exp(0.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0, __llvm_libc::exp(-0.0));
+}
diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp
new file mode 100644
index 0000000000000..174d58013cc06
--- /dev/null
+++ b/libc/test/src/math/smoke/expf_test.cpp
@@ -0,0 +1,52 @@
+//===-- Unittests for expf ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/expf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <stdint.h>
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcExpfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::expf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, __llvm_libc::expf(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0f, __llvm_libc::expf(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::expf(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0f, __llvm_libc::expf(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST(LlvmLibcExpfTest, Overflow) {
+  libc_errno = 0;
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::expf(float(FPBits(0x7f7fffffU))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::expf(float(FPBits(0x42cffff8U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::expf(float(FPBits(0x42d00008U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+}
diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp
new file mode 100644
index 0000000000000..f34562434eb34
--- /dev/null
+++ b/libc/test/src/math/smoke/expm1f_test.cpp
@@ -0,0 +1,52 @@
+//===-- Unittests for expm1f ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/expm1f.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <stdint.h>
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcExpm1fTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, __llvm_libc::expm1f(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, __llvm_libc::expm1f(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(-1.0f, __llvm_libc::expm1f(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(0.0f, __llvm_libc::expm1f(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(-0.0f, __llvm_libc::expm1f(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST(LlvmLibcExpm1fTest, Overflow) {
+  libc_errno = 0;
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::expm1f(float(FPBits(0x7f7fffffU))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::expm1f(float(FPBits(0x42cffff8U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::expm1f(float(FPBits(0x42d00008U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+}
diff --git a/libc/test/src/math/smoke/fabs_test.cpp b/libc/test/src/math/smoke/fabs_test.cpp
new file mode 100644
index 0000000000000..2bf0ddda44ade
--- /dev/null
+++ b/libc/test/src/math/smoke/fabs_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fabs ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FAbsTest.h"
+
+#include "src/math/fabs.h"
+
+LIST_FABS_TESTS(double, __llvm_libc::fabs)
diff --git a/libc/test/src/math/smoke/fabsf_test.cpp b/libc/test/src/math/smoke/fabsf_test.cpp
new file mode 100644
index 0000000000000..6374a1018f1da
--- /dev/null
+++ b/libc/test/src/math/smoke/fabsf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fabsf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FAbsTest.h"
+
+#include "src/math/fabsf.h"
+
+LIST_FABS_TESTS(float, __llvm_libc::fabsf)
diff --git a/libc/test/src/math/smoke/fabsl_test.cpp b/libc/test/src/math/smoke/fabsl_test.cpp
new file mode 100644
index 0000000000000..ff1320225452a
--- /dev/null
+++ b/libc/test/src/math/smoke/fabsl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fabsl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FAbsTest.h"
+
+#include "src/math/fabsl.h"
+
+LIST_FABS_TESTS(long double, __llvm_libc::fabsl)
diff --git a/libc/test/src/math/smoke/fdim_test.cpp b/libc/test/src/math/smoke/fdim_test.cpp
new file mode 100644
index 0000000000000..80174b79b95b8
--- /dev/null
+++ b/libc/test/src/math/smoke/fdim_test.cpp
@@ -0,0 +1,31 @@
+//===-- Unittests for fdim ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FDimTest.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/fdim.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+using LlvmLibcFDimTest = FDimTestTemplate<double>;
+
+TEST_F(LlvmLibcFDimTest, NaNArg_fdim) { test_na_n_arg(&__llvm_libc::fdim); }
+
+TEST_F(LlvmLibcFDimTest, InfArg_fdim) { test_inf_arg(&__llvm_libc::fdim); }
+
+TEST_F(LlvmLibcFDimTest, NegInfArg_fdim) {
+  test_neg_inf_arg(&__llvm_libc::fdim);
+}
+
+TEST_F(LlvmLibcFDimTest, BothZero_fdim) { test_both_zero(&__llvm_libc::fdim); }
+
+TEST_F(LlvmLibcFDimTest, InDoubleRange_fdim) {
+  test_in_range(&__llvm_libc::fdim);
+}
diff --git a/libc/test/src/math/smoke/fdimf_test.cpp b/libc/test/src/math/smoke/fdimf_test.cpp
new file mode 100644
index 0000000000000..531b89b81ffc9
--- /dev/null
+++ b/libc/test/src/math/smoke/fdimf_test.cpp
@@ -0,0 +1,33 @@
+//===-- Unittests for fdimf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FDimTest.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/fdimf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+using LlvmLibcFDimTest = FDimTestTemplate<float>;
+
+TEST_F(LlvmLibcFDimTest, NaNArg_fdimf) { test_na_n_arg(&__llvm_libc::fdimf); }
+
+TEST_F(LlvmLibcFDimTest, InfArg_fdimf) { test_inf_arg(&__llvm_libc::fdimf); }
+
+TEST_F(LlvmLibcFDimTest, NegInfArg_fdimf) {
+  test_neg_inf_arg(&__llvm_libc::fdimf);
+}
+
+TEST_F(LlvmLibcFDimTest, BothZero_fdimf) {
+  test_both_zero(&__llvm_libc::fdimf);
+}
+
+TEST_F(LlvmLibcFDimTest, InFloatRange_fdimf) {
+  test_in_range(&__llvm_libc::fdimf);
+}
diff --git a/libc/test/src/math/smoke/fdiml_test.cpp b/libc/test/src/math/smoke/fdiml_test.cpp
new file mode 100644
index 0000000000000..bb6c54d567a0b
--- /dev/null
+++ b/libc/test/src/math/smoke/fdiml_test.cpp
@@ -0,0 +1,33 @@
+//===-- Unittests for fdiml -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FDimTest.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/fdiml.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+using LlvmLibcFDimTest = FDimTestTemplate<long double>;
+
+TEST_F(LlvmLibcFDimTest, NaNArg_fdiml) { test_na_n_arg(&__llvm_libc::fdiml); }
+
+TEST_F(LlvmLibcFDimTest, InfArg_fdiml) { test_inf_arg(&__llvm_libc::fdiml); }
+
+TEST_F(LlvmLibcFDimTest, NegInfArg_fdiml) {
+  test_neg_inf_arg(&__llvm_libc::fdiml);
+}
+
+TEST_F(LlvmLibcFDimTest, BothZero_fdiml) {
+  test_both_zero(&__llvm_libc::fdiml);
+}
+
+TEST_F(LlvmLibcFDimTest, InLongDoubleRange_fdiml) {
+  test_in_range(&__llvm_libc::fdiml);
+}
diff --git a/libc/test/src/math/smoke/floor_test.cpp b/libc/test/src/math/smoke/floor_test.cpp
new file mode 100644
index 0000000000000..e68ed3ecbb603
--- /dev/null
+++ b/libc/test/src/math/smoke/floor_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for floor -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FloorTest.h"
+
+#include "src/math/floor.h"
+
+LIST_FLOOR_TESTS(double, __llvm_libc::floor)
diff --git a/libc/test/src/math/smoke/floorf_test.cpp b/libc/test/src/math/smoke/floorf_test.cpp
new file mode 100644
index 0000000000000..f0538f7b97b22
--- /dev/null
+++ b/libc/test/src/math/smoke/floorf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for floorf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FloorTest.h"
+
+#include "src/math/floorf.h"
+
+LIST_FLOOR_TESTS(float, __llvm_libc::floorf)
diff --git a/libc/test/src/math/smoke/floorl_test.cpp b/libc/test/src/math/smoke/floorl_test.cpp
new file mode 100644
index 0000000000000..1b244fd62a117
--- /dev/null
+++ b/libc/test/src/math/smoke/floorl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for floorl ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FloorTest.h"
+
+#include "src/math/floorl.h"
+
+LIST_FLOOR_TESTS(long double, __llvm_libc::floorl)
diff --git a/libc/test/src/math/smoke/fma_test.cpp b/libc/test/src/math/smoke/fma_test.cpp
new file mode 100644
index 0000000000000..f33860f65236b
--- /dev/null
+++ b/libc/test/src/math/smoke/fma_test.cpp
@@ -0,0 +1,17 @@
+//===-- Unittests for fma -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/math/fma.h"
+
+using LlvmLibcFmaTest = FmaTestTemplate<double>;
+
+TEST_F(LlvmLibcFmaTest, SpecialNumbers) {
+  test_special_numbers(&__llvm_libc::fma);
+}
diff --git a/libc/test/src/math/smoke/fmaf_test.cpp b/libc/test/src/math/smoke/fmaf_test.cpp
new file mode 100644
index 0000000000000..38c2ea35db056
--- /dev/null
+++ b/libc/test/src/math/smoke/fmaf_test.cpp
@@ -0,0 +1,17 @@
+//===-- Unittests for fmaf ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/math/fmaf.h"
+
+using LlvmLibcFmafTest = FmaTestTemplate<float>;
+
+TEST_F(LlvmLibcFmafTest, SpecialNumbers) {
+  test_special_numbers(&__llvm_libc::fmaf);
+}
diff --git a/libc/test/src/math/smoke/fmax_test.cpp b/libc/test/src/math/smoke/fmax_test.cpp
new file mode 100644
index 0000000000000..9c81c09414f01
--- /dev/null
+++ b/libc/test/src/math/smoke/fmax_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmax -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#include "FMaxTest.h"
+
+#include "src/math/fmax.h"
+
+LIST_FMAX_TESTS(double, __llvm_libc::fmax)
diff --git a/libc/test/src/math/smoke/fmaxf_test.cpp b/libc/test/src/math/smoke/fmaxf_test.cpp
new file mode 100644
index 0000000000000..cf337d4f07e03
--- /dev/null
+++ b/libc/test/src/math/smoke/fmaxf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmaxf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FMaxTest.h"
+
+#include "src/math/fmaxf.h"
+
+LIST_FMAX_TESTS(float, __llvm_libc::fmaxf)
diff --git a/libc/test/src/math/smoke/fmaxl_test.cpp b/libc/test/src/math/smoke/fmaxl_test.cpp
new file mode 100644
index 0000000000000..636bc4208caef
--- /dev/null
+++ b/libc/test/src/math/smoke/fmaxl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmaxl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FMaxTest.h"
+
+#include "src/math/fmaxl.h"
+
+LIST_FMAX_TESTS(long double, __llvm_libc::fmaxl)
diff --git a/libc/test/src/math/smoke/fmin_test.cpp b/libc/test/src/math/smoke/fmin_test.cpp
new file mode 100644
index 0000000000000..3515f97faca33
--- /dev/null
+++ b/libc/test/src/math/smoke/fmin_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmin -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#include "FMinTest.h"
+
+#include "src/math/fmin.h"
+
+LIST_FMIN_TESTS(double, __llvm_libc::fmin)
diff --git a/libc/test/src/math/smoke/fminf_test.cpp b/libc/test/src/math/smoke/fminf_test.cpp
new file mode 100644
index 0000000000000..1a762478af613
--- /dev/null
+++ b/libc/test/src/math/smoke/fminf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fminf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FMinTest.h"
+
+#include "src/math/fminf.h"
+
+LIST_FMIN_TESTS(float, __llvm_libc::fminf)
diff --git a/libc/test/src/math/smoke/fminl_test.cpp b/libc/test/src/math/smoke/fminl_test.cpp
new file mode 100644
index 0000000000000..ce8b705b0385e
--- /dev/null
+++ b/libc/test/src/math/smoke/fminl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fminl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FMinTest.h"
+
+#include "src/math/fminl.h"
+
+LIST_FMIN_TESTS(long double, __llvm_libc::fminl)
diff --git a/libc/test/src/math/smoke/fmod_test.cpp b/libc/test/src/math/smoke/fmod_test.cpp
new file mode 100644
index 0000000000000..03790e4941e19
--- /dev/null
+++ b/libc/test/src/math/smoke/fmod_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmod ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmod.h"
+
+LIST_FMOD_TESTS(double, __llvm_libc::fmod)
diff --git a/libc/test/src/math/smoke/fmodf_test.cpp b/libc/test/src/math/smoke/fmodf_test.cpp
new file mode 100644
index 0000000000000..2b13379eba252
--- /dev/null
+++ b/libc/test/src/math/smoke/fmodf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmodf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FModTest.h" + +#include "src/math/fmodf.h" + +LIST_FMOD_TESTS(float, __llvm_libc::fmodf) diff --git a/libc/test/src/math/smoke/frexp_test.cpp b/libc/test/src/math/smoke/frexp_test.cpp new file mode 100644 index 0000000000000..a44aefc7d8b4e --- /dev/null +++ b/libc/test/src/math/smoke/frexp_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for frexp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FrexpTest.h" + +#include "src/math/frexp.h" + +LIST_FREXP_TESTS(double, __llvm_libc::frexp) diff --git a/libc/test/src/math/smoke/frexpf_test.cpp b/libc/test/src/math/smoke/frexpf_test.cpp new file mode 100644 index 0000000000000..5d78f94ec1e59 --- /dev/null +++ b/libc/test/src/math/smoke/frexpf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for frexpf ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FrexpTest.h" + +#include "src/math/frexpf.h" + +LIST_FREXP_TESTS(float, __llvm_libc::frexpf) diff --git a/libc/test/src/math/smoke/frexpl_test.cpp b/libc/test/src/math/smoke/frexpl_test.cpp new file mode 100644 index 0000000000000..4904ccd14907b --- /dev/null +++ b/libc/test/src/math/smoke/frexpl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for frexpl ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FrexpTest.h" + +#include "src/math/frexpl.h" + +LIST_FREXP_TESTS(long double, __llvm_libc::frexpl) diff --git a/libc/test/src/math/smoke/generic_sqrt_test.cpp b/libc/test/src/math/smoke/generic_sqrt_test.cpp new file mode 100644 index 0000000000000..cecfc0ee3de3a --- /dev/null +++ b/libc/test/src/math/smoke/generic_sqrt_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for generic implementation of sqrt ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/__support/FPUtil/generic/sqrt.h"
+
+LIST_SQRT_TESTS(double, __llvm_libc::fputil::sqrt)
diff --git a/libc/test/src/math/smoke/generic_sqrtf_test.cpp b/libc/test/src/math/smoke/generic_sqrtf_test.cpp
new file mode 100644
index 0000000000000..64bf92133b98f
--- /dev/null
+++ b/libc/test/src/math/smoke/generic_sqrtf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for generic implementation of sqrtf----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/__support/FPUtil/generic/sqrt.h"
+
+LIST_SQRT_TESTS(float, __llvm_libc::fputil::sqrt)
diff --git a/libc/test/src/math/smoke/generic_sqrtl_test.cpp b/libc/test/src/math/smoke/generic_sqrtl_test.cpp
new file mode 100644
index 0000000000000..6b68aaed97004
--- /dev/null
+++ b/libc/test/src/math/smoke/generic_sqrtl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for generic implementation of sqrtl----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/__support/FPUtil/generic/sqrt.h"
+
+LIST_SQRT_TESTS(long double, __llvm_libc::fputil::sqrt)
diff --git a/libc/test/src/math/smoke/hypot_test.cpp b/libc/test/src/math/smoke/hypot_test.cpp
new file mode 100644
index 0000000000000..f865b9f903e35
--- /dev/null
+++ b/libc/test/src/math/smoke/hypot_test.cpp
@@ -0,0 +1,17 @@
+//===-- Unittests for hypot -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "HypotTest.h"
+
+#include "src/math/hypot.h"
+
+using LlvmLibcHypotTest = HypotTestTemplate<double>;
+
+TEST_F(LlvmLibcHypotTest, SpecialNumbers) {
+  test_special_numbers(&__llvm_libc::hypot);
+}
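For readers unfamiliar with the shared HypotTest harness: the special-number
expectations for hypot come from IEEE 754 and Annex F of the C standard. A
minimal standalone sketch of those rules, using the hosted std::hypot rather
than the llvm-libc fixture (the assertions are the standard's requirements,
not an excerpt from HypotTest.h):

// Standalone sketch, not part of the patch: Annex F rules for hypot.
#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  const double nan = std::numeric_limits<double>::quiet_NaN();
  assert(std::hypot(-3.0, 0.0) == 3.0);     // hypot(x, +/-0) == fabs(x)
  assert(std::hypot(inf, nan) == inf);      // an infinite arg wins even over NaN
  assert(std::isnan(std::hypot(nan, 1.0))); // otherwise NaN propagates
  return 0;
}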
diff --git a/libc/test/src/math/smoke/hypotf_test.cpp b/libc/test/src/math/smoke/hypotf_test.cpp
new file mode 100644
index 0000000000000..0c0ee3127cd44
--- /dev/null
+++ b/libc/test/src/math/smoke/hypotf_test.cpp
@@ -0,0 +1,17 @@
+//===-- Unittests for hypotf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "HypotTest.h"
+
+#include "src/math/hypotf.h"
+
+using LlvmLibcHypotfTest = HypotTestTemplate<float>;
+
+TEST_F(LlvmLibcHypotfTest, SpecialNumbers) {
+  test_special_numbers(&__llvm_libc::hypotf);
+}
diff --git a/libc/test/src/math/smoke/ilogb_test.cpp b/libc/test/src/math/smoke/ilogb_test.cpp
new file mode 100644
index 0000000000000..a77adec4adba4
--- /dev/null
+++ b/libc/test/src/math/smoke/ilogb_test.cpp
@@ -0,0 +1,36 @@
+//===-- Unittests for ilogb -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/math/ilogb.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogb) {
+  test_special_numbers(&__llvm_libc::ilogb);
+}
+
+TEST_F(LlvmLibcILogbTest, PowersOfTwo_ilogb) {
+  test_powers_of_two(&__llvm_libc::ilogb);
+}
+
+TEST_F(LlvmLibcILogbTest, SomeIntegers_ilogb) {
+  test_some_integers(&__llvm_libc::ilogb);
+}
+
+TEST_F(LlvmLibcILogbTest, SubnormalRange_ilogb) {
+  test_subnormal_range(&__llvm_libc::ilogb);
+}
+
+TEST_F(LlvmLibcILogbTest, NormalRange_ilogb) {
+  test_normal_range(&__llvm_libc::ilogb);
+}
diff --git a/libc/test/src/math/smoke/ilogbf_test.cpp b/libc/test/src/math/smoke/ilogbf_test.cpp
new file mode 100644
index 0000000000000..20904a13c98a8
--- /dev/null
+++ b/libc/test/src/math/smoke/ilogbf_test.cpp
@@ -0,0 +1,36 @@
+//===-- Unittests for ilogbf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/math/ilogbf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbf) {
+  test_special_numbers(&__llvm_libc::ilogbf);
+}
+
+TEST_F(LlvmLibcILogbTest, PowersOfTwo_ilogbf) {
+  test_powers_of_two(&__llvm_libc::ilogbf);
+}
+
+TEST_F(LlvmLibcILogbTest, SomeIntegers_ilogbf) {
+  test_some_integers(&__llvm_libc::ilogbf);
+}
+
+TEST_F(LlvmLibcILogbTest, SubnormalRange_ilogbf) {
+  test_subnormal_range(&__llvm_libc::ilogbf);
+}
+
+TEST_F(LlvmLibcILogbTest, NormalRange_ilogbf) {
+  test_normal_range(&__llvm_libc::ilogbf);
+}
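The ILogbTest fixture's expectations follow the C standard, where ilogb
returns an int and therefore has to signal exceptional inputs through sentinel
values rather than NaN or infinity. A standalone sketch of those corner cases
as they behave on a typical hosted implementation (illustrative, not an
excerpt from ILogbTest.h):

// Standalone sketch, not part of the patch: C-standard ilogb corner cases.
#include <cassert>
#include <climits>
#include <cmath>
#include <limits>

int main() {
  assert(std::ilogb(0.0) == FP_ILOGB0); // sentinel, typically INT_MIN
  assert(std::ilogb(std::numeric_limits<double>::quiet_NaN()) == FP_ILOGBNAN);
  assert(std::ilogb(std::numeric_limits<double>::infinity()) == INT_MAX);
  assert(std::ilogb(8.0) == 3);         // unbiased exponent of 2^3
  assert(std::ilogb(0.25) == -2);       // works below 1.0 as well
  return 0;
}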
diff --git a/libc/test/src/math/smoke/ilogbl_test.cpp b/libc/test/src/math/smoke/ilogbl_test.cpp
new file mode 100644
index 0000000000000..9647f232245dc
--- /dev/null
+++ b/libc/test/src/math/smoke/ilogbl_test.cpp
@@ -0,0 +1,36 @@
+//===-- Unittests for ilogbl ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ILogbTest.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/math/ilogbl.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbl) {
+  test_special_numbers(&__llvm_libc::ilogbl);
+}
+
+TEST_F(LlvmLibcILogbTest, PowersOfTwo_ilogbl) {
+  test_powers_of_two(&__llvm_libc::ilogbl);
+}
+
+TEST_F(LlvmLibcILogbTest, SomeIntegers_ilogbl) {
+  test_some_integers(&__llvm_libc::ilogbl);
+}
+
+TEST_F(LlvmLibcILogbTest, SubnormalRange_ilogbl) {
+  test_subnormal_range(&__llvm_libc::ilogbl);
+}
+
+TEST_F(LlvmLibcILogbTest, NormalRange_ilogbl) {
+  test_normal_range(&__llvm_libc::ilogbl);
+}
diff --git a/libc/test/src/math/smoke/ldexp_test.cpp b/libc/test/src/math/smoke/ldexp_test.cpp
new file mode 100644
index 0000000000000..c078aa4b7ef84
--- /dev/null
+++ b/libc/test/src/math/smoke/ldexp_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for ldexp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LdExpTest.h"
+
+#include "src/math/ldexp.h"
+
+LIST_LDEXP_TESTS(double, __llvm_libc::ldexp)
diff --git a/libc/test/src/math/smoke/ldexpf_test.cpp b/libc/test/src/math/smoke/ldexpf_test.cpp
new file mode 100644
index 0000000000000..4fe80fc79151c
--- /dev/null
+++ b/libc/test/src/math/smoke/ldexpf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for ldexpf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LdExpTest.h"
+
+#include "src/math/ldexpf.h"
+
+LIST_LDEXP_TESTS(float, __llvm_libc::ldexpf)
diff --git a/libc/test/src/math/smoke/ldexpl_test.cpp b/libc/test/src/math/smoke/ldexpl_test.cpp
new file mode 100644
index 0000000000000..5e6e6e75ae085
--- /dev/null
+++ b/libc/test/src/math/smoke/ldexpl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for ldexpl ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LdExpTest.h"
+
+#include "src/math/ldexpl.h"
+
+LIST_LDEXP_TESTS(long double, __llvm_libc::ldexpl)
diff --git a/libc/test/src/math/smoke/llrint_test.cpp b/libc/test/src/math/smoke/llrint_test.cpp
new file mode 100644
index 0000000000000..8027004bd50a5
--- /dev/null
+++ b/libc/test/src/math/smoke/llrint_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for llrint ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundToIntegerTest.h" + +#include "src/math/llrint.h" + +LIST_ROUND_TO_INTEGER_TESTS_WITH_MODES(double, long long, __llvm_libc::llrint) diff --git a/libc/test/src/math/smoke/llrintf_test.cpp b/libc/test/src/math/smoke/llrintf_test.cpp new file mode 100644 index 0000000000000..9ad710e469835 --- /dev/null +++ b/libc/test/src/math/smoke/llrintf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for llrintf ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundToIntegerTest.h" + +#include "src/math/llrintf.h" + +LIST_ROUND_TO_INTEGER_TESTS_WITH_MODES(float, long long, __llvm_libc::llrintf) diff --git a/libc/test/src/math/smoke/llrintl_test.cpp b/libc/test/src/math/smoke/llrintl_test.cpp new file mode 100644 index 0000000000000..948e961e57bd4 --- /dev/null +++ b/libc/test/src/math/smoke/llrintl_test.cpp @@ -0,0 +1,14 @@ +//===-- Unittests for llrintl ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundToIntegerTest.h" + +#include "src/math/llrintl.h" + +LIST_ROUND_TO_INTEGER_TESTS_WITH_MODES(long double, long long, + __llvm_libc::llrintl) diff --git a/libc/test/src/math/smoke/llround_test.cpp b/libc/test/src/math/smoke/llround_test.cpp new file mode 100644 index 0000000000000..c86095dc9aae0 --- /dev/null +++ b/libc/test/src/math/smoke/llround_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for llround ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundToIntegerTest.h" + +#include "src/math/llround.h" + +LIST_ROUND_TO_INTEGER_TESTS(double, long long, __llvm_libc::llround) diff --git a/libc/test/src/math/smoke/llroundf_test.cpp b/libc/test/src/math/smoke/llroundf_test.cpp new file mode 100644 index 0000000000000..5e36258c246ae --- /dev/null +++ b/libc/test/src/math/smoke/llroundf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for llroundf --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RoundToIntegerTest.h"
+
+#include "src/math/llroundf.h"
+
+LIST_ROUND_TO_INTEGER_TESTS(float, long long, __llvm_libc::llroundf)
diff --git a/libc/test/src/math/smoke/llroundl_test.cpp b/libc/test/src/math/smoke/llroundl_test.cpp
new file mode 100644
index 0000000000000..8fbc840c4f2ac
--- /dev/null
+++ b/libc/test/src/math/smoke/llroundl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for llroundl --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RoundToIntegerTest.h"
+
+#include "src/math/llroundl.h"
+
+LIST_ROUND_TO_INTEGER_TESTS(long double, long long, __llvm_libc::llroundl)
diff --git a/libc/test/src/math/smoke/log10_test.cpp b/libc/test/src/math/smoke/log10_test.cpp
new file mode 100644
index 0000000000000..8a214748fb2e0
--- /dev/null
+++ b/libc/test/src/math/smoke/log10_test.cpp
@@ -0,0 +1,36 @@
+//===-- Unittests for log10 -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/log10.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using __llvm_libc::testing::tlog;
+
+DECLARE_SPECIAL_CONSTANTS(double)
+
+TEST(LlvmLibcLog10Test, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::log10(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::log10(inf));
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log10(neg_inf), FE_INVALID);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log10(0.0), FE_DIVBYZERO);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log10(-0.0), FE_DIVBYZERO);
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log10(-1.0), FE_INVALID);
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::log10(1.0));
+
+  double x = 1.0;
+  for (int i = 0; i < 11; ++i, x *= 10.0) {
+    EXPECT_FP_EQ_ALL_ROUNDING(static_cast<double>(i), __llvm_libc::log10(x));
+  }
+}
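The powers-of-ten loop above leans on an exactness argument worth spelling
out: 10^i = 2^i * 5^i, and 5^10 < 2^53, so every power of ten up to 10^10 is
exactly representable as a double, and log10 of such a value is an exact
small integer. That is why the test can demand bit-exact results in every
rounding mode. A standalone sketch of the same invariant against a hosted
libm (not part of the patch):

// log10(10^i) == i exactly for i = 0..10, since those powers of ten
// are exact doubles and the true result is an integer.
#include <cassert>
#include <cmath>

int main() {
  double x = 1.0;
  for (int i = 0; i < 11; ++i, x *= 10.0)
    assert(std::log10(x) == static_cast<double>(i));
  return 0;
}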
diff --git a/libc/test/src/math/smoke/log10f_test.cpp b/libc/test/src/math/smoke/log10f_test.cpp
new file mode 100644
index 0000000000000..d7838140bec27
--- /dev/null
+++ b/libc/test/src/math/smoke/log10f_test.cpp
@@ -0,0 +1,34 @@
+//===-- Unittests for log10f ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/log10f.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcLog10fTest, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::log10f(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::log10f(inf));
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log10f(neg_inf), FE_INVALID);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log10f(0.0f), FE_DIVBYZERO);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log10f(-0.0f),
+                              FE_DIVBYZERO);
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log10f(-1.0f), FE_INVALID);
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::log10f(1.0f));
+
+  float x = 1.0f;
+  for (int i = 0; i < 11; ++i, x *= 10.0f) {
+    EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float>(i), __llvm_libc::log10f(x));
+  }
+}
diff --git a/libc/test/src/math/smoke/log1p_test.cpp b/libc/test/src/math/smoke/log1p_test.cpp
new file mode 100644
index 0000000000000..14e1977e060e7
--- /dev/null
+++ b/libc/test/src/math/smoke/log1p_test.cpp
@@ -0,0 +1,31 @@
+//===-- Unittests for log1p -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/log1p.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using __llvm_libc::testing::tlog;
+
+DECLARE_SPECIAL_CONSTANTS(double)
+
+TEST(LlvmLibcLog1pTest, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::log1p(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::log1p(inf));
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log1p(neg_inf), FE_INVALID);
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log1p(-2.0), FE_INVALID);
+  EXPECT_FP_EQ(zero, __llvm_libc::log1p(0.0));
+  EXPECT_FP_EQ(neg_zero, __llvm_libc::log1p(-0.0));
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log1p(-1.0), FE_DIVBYZERO);
+}
diff --git a/libc/test/src/math/smoke/log1pf_test.cpp b/libc/test/src/math/smoke/log1pf_test.cpp
new file mode 100644
index 0000000000000..22043ba6e278d
--- /dev/null
+++ b/libc/test/src/math/smoke/log1pf_test.cpp
@@ -0,0 +1,29 @@
+//===-- Unittests for log1pf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/log1pf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibclog1pfTest, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::log1pf(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::log1pf(inf));
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log1pf(neg_inf), FE_INVALID);
+  EXPECT_FP_EQ(zero, __llvm_libc::log1pf(0.0f));
+  EXPECT_FP_EQ(neg_zero, __llvm_libc::log1pf(-0.0f));
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log1pf(-1.0f),
+                              FE_DIVBYZERO);
+}
diff --git a/libc/test/src/math/smoke/log2_test.cpp b/libc/test/src/math/smoke/log2_test.cpp
new file mode 100644
index 0000000000000..2726a6b1c3a72
--- /dev/null
+++ b/libc/test/src/math/smoke/log2_test.cpp
@@ -0,0 +1,31 @@
+//===-- Unittests for log2 ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/log2.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using __llvm_libc::testing::tlog;
+
+DECLARE_SPECIAL_CONSTANTS(double)
+
+TEST(LlvmLibcLog2Test, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::log2(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::log2(inf));
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log2(neg_inf), FE_INVALID);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log2(0.0), FE_DIVBYZERO);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log2(-0.0), FE_DIVBYZERO);
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log2(-1.0), FE_INVALID);
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::log2(1.0));
+}
diff --git a/libc/test/src/math/smoke/log2f_test.cpp b/libc/test/src/math/smoke/log2f_test.cpp
new file mode 100644
index 0000000000000..6b05e6cc715eb
--- /dev/null
+++ b/libc/test/src/math/smoke/log2f_test.cpp
@@ -0,0 +1,28 @@
+//===-- Unittests for log2f -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/log2f.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <stdint.h>
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcLog2fTest, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::log2f(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::log2f(inf));
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log2f(neg_inf), FE_INVALID);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log2f(0.0f), FE_DIVBYZERO);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log2f(-0.0f), FE_DIVBYZERO);
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log2f(-1.0f), FE_INVALID);
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::log2f(1.0f));
+}
diff --git a/libc/test/src/math/smoke/log_test.cpp b/libc/test/src/math/smoke/log_test.cpp
new file mode 100644
index 0000000000000..4277123214a67
--- /dev/null
+++ b/libc/test/src/math/smoke/log_test.cpp
@@ -0,0 +1,31 @@
+//===-- Unittests for log -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/log.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using __llvm_libc::testing::tlog;
+
+DECLARE_SPECIAL_CONSTANTS(double)
+
+TEST(LlvmLibcLogTest, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::log(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::log(inf));
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log(neg_inf), FE_INVALID);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log(0.0), FE_DIVBYZERO);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::log(-0.0), FE_DIVBYZERO);
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::log(-1.0), FE_INVALID);
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::log(1.0));
+}
diff --git a/libc/test/src/math/smoke/logb_test.cpp b/libc/test/src/math/smoke/logb_test.cpp
new file mode 100644
index 0000000000000..b90348522aa3a
--- /dev/null
+++ b/libc/test/src/math/smoke/logb_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for logb ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LogbTest.h"
+
+#include "src/math/logb.h"
+
+LIST_LOGB_TESTS(double, __llvm_libc::logb)
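Where ilogb returns an int, logb returns the exponent as a floating-point
value, so the LogbTest corner cases use values rather than sentinels. A
standalone sketch of the standard's expectations using the hosted <cmath>
(illustrative only, not an excerpt from LogbTest.h):

// Standalone sketch, not part of the patch: logb is the floating-point
// sibling of ilogb.
#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  assert(std::logb(8.0) == 3.0);  // exponent returned as a double
  assert(std::logb(inf) == inf);  // logb(+/-inf) == +inf
  assert(std::logb(0.0) == -inf); // pole error, raises FE_DIVBYZERO
  return 0;
}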
diff --git a/libc/test/src/math/smoke/logbf_test.cpp b/libc/test/src/math/smoke/logbf_test.cpp
new file mode 100644
index 0000000000000..727eefffbb174
--- /dev/null
+++ b/libc/test/src/math/smoke/logbf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for logbf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LogbTest.h"
+
+#include "src/math/logbf.h"
+
+LIST_LOGB_TESTS(float, __llvm_libc::logbf)
diff --git a/libc/test/src/math/smoke/logbl_test.cpp b/libc/test/src/math/smoke/logbl_test.cpp
new file mode 100644
index 0000000000000..1c7b0d4e66e84
--- /dev/null
+++ b/libc/test/src/math/smoke/logbl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for logbl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LogbTest.h"
+
+#include "src/math/logbl.h"
+
+LIST_LOGB_TESTS(long double, __llvm_libc::logbl)
diff --git a/libc/test/src/math/smoke/logf_test.cpp b/libc/test/src/math/smoke/logf_test.cpp
new file mode 100644
index 0000000000000..20fa51735ad75
--- /dev/null
+++ b/libc/test/src/math/smoke/logf_test.cpp
@@ -0,0 +1,27 @@
+//===-- Unittests for logf-----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/logf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <stdint.h>
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcLogfTest, SpecialNumbers) {
+  EXPECT_FP_EQ(aNaN, __llvm_libc::logf(aNaN));
+  EXPECT_FP_EQ(inf, __llvm_libc::logf(inf));
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::logf(neg_inf), FE_INVALID);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::logf(0.0f), FE_DIVBYZERO);
+  EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::logf(-0.0f), FE_DIVBYZERO);
+  EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::logf(-1.0f), FE_INVALID);
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::logf(1.0f));
+}
diff --git a/libc/test/src/math/smoke/lrint_test.cpp b/libc/test/src/math/smoke/lrint_test.cpp
new file mode 100644
index 0000000000000..0dbfa3dd466d9
--- /dev/null
+++ b/libc/test/src/math/smoke/lrint_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for lrint -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RoundToIntegerTest.h"
+
+#include "src/math/lrint.h"
+
+LIST_ROUND_TO_INTEGER_TESTS_WITH_MODES(double, long, __llvm_libc::lrint)
diff --git a/libc/test/src/math/smoke/lrintf_test.cpp b/libc/test/src/math/smoke/lrintf_test.cpp
new file mode 100644
index 0000000000000..407813aa83c0f
--- /dev/null
+++ b/libc/test/src/math/smoke/lrintf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for lrintf ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundToIntegerTest.h" + +#include "src/math/lrintf.h" + +LIST_ROUND_TO_INTEGER_TESTS_WITH_MODES(float, long, __llvm_libc::lrintf) diff --git a/libc/test/src/math/smoke/lrintl_test.cpp b/libc/test/src/math/smoke/lrintl_test.cpp new file mode 100644 index 0000000000000..a4551c38de106 --- /dev/null +++ b/libc/test/src/math/smoke/lrintl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for lrintl ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundToIntegerTest.h" + +#include "src/math/lrintl.h" + +LIST_ROUND_TO_INTEGER_TESTS_WITH_MODES(long double, long, __llvm_libc::lrintl) diff --git a/libc/test/src/math/smoke/lround_test.cpp b/libc/test/src/math/smoke/lround_test.cpp new file mode 100644 index 0000000000000..0ee580f316b44 --- /dev/null +++ b/libc/test/src/math/smoke/lround_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for lround ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundToIntegerTest.h" + +#include "src/math/lround.h" + +LIST_ROUND_TO_INTEGER_TESTS(double, long, __llvm_libc::lround) diff --git a/libc/test/src/math/smoke/lroundf_test.cpp b/libc/test/src/math/smoke/lroundf_test.cpp new file mode 100644 index 0000000000000..7a2aad20e4f66 --- /dev/null +++ b/libc/test/src/math/smoke/lroundf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for lroundf ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundToIntegerTest.h" + +#include "src/math/lroundf.h" + +LIST_ROUND_TO_INTEGER_TESTS(float, long, __llvm_libc::lroundf) diff --git a/libc/test/src/math/smoke/lroundl_test.cpp b/libc/test/src/math/smoke/lroundl_test.cpp new file mode 100644 index 0000000000000..c0ea339782052 --- /dev/null +++ b/libc/test/src/math/smoke/lroundl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for lroundl ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundToIntegerTest.h" + +#include "src/math/lroundl.h" + +LIST_ROUND_TO_INTEGER_TESTS(long double, long, __llvm_libc::lroundl) diff --git a/libc/test/src/math/smoke/modf_test.cpp b/libc/test/src/math/smoke/modf_test.cpp new file mode 100644 index 0000000000000..ff89517506b4f --- /dev/null +++ b/libc/test/src/math/smoke/modf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for modf ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ModfTest.h" + +#include "src/math/modf.h" + +LIST_MODF_TESTS(double, __llvm_libc::modf) diff --git a/libc/test/src/math/smoke/modff_test.cpp b/libc/test/src/math/smoke/modff_test.cpp new file mode 100644 index 0000000000000..440304ab95ac7 --- /dev/null +++ b/libc/test/src/math/smoke/modff_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for modff -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ModfTest.h" + +#include "src/math/modff.h" + +LIST_MODF_TESTS(float, __llvm_libc::modff) diff --git a/libc/test/src/math/smoke/modfl_test.cpp b/libc/test/src/math/smoke/modfl_test.cpp new file mode 100644 index 0000000000000..c98678b3e9243 --- /dev/null +++ b/libc/test/src/math/smoke/modfl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for modfl -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ModfTest.h" + +#include "src/math/modfl.h" + +LIST_MODF_TESTS(long double, __llvm_libc::modfl) diff --git a/libc/test/src/math/smoke/nextafter_test.cpp b/libc/test/src/math/smoke/nextafter_test.cpp new file mode 100644 index 0000000000000..2dde48ceb053c --- /dev/null +++ b/libc/test/src/math/smoke/nextafter_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for nextafter -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NextAfterTest.h" + +#include "src/math/nextafter.h" + +LIST_NEXTAFTER_TESTS(double, __llvm_libc::nextafter) diff --git a/libc/test/src/math/smoke/nextafterf_test.cpp b/libc/test/src/math/smoke/nextafterf_test.cpp new file mode 100644 index 0000000000000..14234c717797e --- /dev/null +++ b/libc/test/src/math/smoke/nextafterf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for nextafterf ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NextAfterTest.h" + +#include "src/math/nextafterf.h" + +LIST_NEXTAFTER_TESTS(float, __llvm_libc::nextafterf) diff --git a/libc/test/src/math/smoke/nextafterl_test.cpp b/libc/test/src/math/smoke/nextafterl_test.cpp new file mode 100644 index 0000000000000..db85d83edc690 --- /dev/null +++ b/libc/test/src/math/smoke/nextafterl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for nextafterl ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NextAfterTest.h" + +#include "src/math/nextafterl.h" + +LIST_NEXTAFTER_TESTS(long double, __llvm_libc::nextafterl) diff --git a/libc/test/src/math/smoke/remquo_test.cpp b/libc/test/src/math/smoke/remquo_test.cpp new file mode 100644 index 0000000000000..8efec397f39d3 --- /dev/null +++ b/libc/test/src/math/smoke/remquo_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for remquo ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RemQuoTest.h" + +#include "src/math/remquo.h" + +LIST_REMQUO_TESTS(double, __llvm_libc::remquo) diff --git a/libc/test/src/math/smoke/remquof_test.cpp b/libc/test/src/math/smoke/remquof_test.cpp new file mode 100644 index 0000000000000..1af4ba4e0153b --- /dev/null +++ b/libc/test/src/math/smoke/remquof_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for remquof ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RemQuoTest.h" + +#include "src/math/remquof.h" + +LIST_REMQUO_TESTS(float, __llvm_libc::remquof) diff --git a/libc/test/src/math/smoke/remquol_test.cpp b/libc/test/src/math/smoke/remquol_test.cpp new file mode 100644 index 0000000000000..e4438e83fe18c --- /dev/null +++ b/libc/test/src/math/smoke/remquol_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for remquol ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RemQuoTest.h" + +#include "src/math/remquol.h" + +LIST_REMQUO_TESTS(long double, __llvm_libc::remquol) diff --git a/libc/test/src/math/smoke/rint_test.cpp b/libc/test/src/math/smoke/rint_test.cpp new file mode 100644 index 0000000000000..eafa1a7fe6119 --- /dev/null +++ b/libc/test/src/math/smoke/rint_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for rint ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RIntTest.h" + +#include "src/math/rint.h" + +LIST_RINT_TESTS(double, __llvm_libc::rint) diff --git a/libc/test/src/math/smoke/rintf_test.cpp b/libc/test/src/math/smoke/rintf_test.cpp new file mode 100644 index 0000000000000..c15a697fac920 --- /dev/null +++ b/libc/test/src/math/smoke/rintf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for rintf -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RIntTest.h" + +#include "src/math/rintf.h" + +LIST_RINT_TESTS(float, __llvm_libc::rintf) diff --git a/libc/test/src/math/smoke/rintl_test.cpp b/libc/test/src/math/smoke/rintl_test.cpp new file mode 100644 index 0000000000000..6c50873a272d0 --- /dev/null +++ b/libc/test/src/math/smoke/rintl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for rintl -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RIntTest.h" + +#include "src/math/rintl.h" + +LIST_RINT_TESTS(long double, __llvm_libc::rintl) diff --git a/libc/test/src/math/smoke/round_test.cpp b/libc/test/src/math/smoke/round_test.cpp new file mode 100644 index 0000000000000..87ce079e89afc --- /dev/null +++ b/libc/test/src/math/smoke/round_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for round -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundTest.h" + +#include "src/math/round.h" + +LIST_ROUND_TESTS(double, __llvm_libc::round) diff --git a/libc/test/src/math/smoke/roundf_test.cpp b/libc/test/src/math/smoke/roundf_test.cpp new file mode 100644 index 0000000000000..0a182dd02deee --- /dev/null +++ b/libc/test/src/math/smoke/roundf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for roundf ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundTest.h" + +#include "src/math/roundf.h" + +LIST_ROUND_TESTS(float, __llvm_libc::roundf) diff --git a/libc/test/src/math/smoke/roundl_test.cpp b/libc/test/src/math/smoke/roundl_test.cpp new file mode 100644 index 0000000000000..32cf724fb7c4f --- /dev/null +++ b/libc/test/src/math/smoke/roundl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for roundl ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RoundTest.h" + +#include "src/math/roundl.h" + +LIST_ROUND_TESTS(long double, __llvm_libc::roundl) diff --git a/libc/test/src/math/smoke/scalbn_test.cpp b/libc/test/src/math/smoke/scalbn_test.cpp new file mode 100644 index 0000000000000..d4d60bf8afca1 --- /dev/null +++ b/libc/test/src/math/smoke/scalbn_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalbn ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalbn.h" + +LIST_SCALBN_TESTS(double, __llvm_libc::scalbn) diff --git a/libc/test/src/math/smoke/scalbnf_test.cpp b/libc/test/src/math/smoke/scalbnf_test.cpp new file mode 100644 index 0000000000000..cbf2e7e1361e1 --- /dev/null +++ b/libc/test/src/math/smoke/scalbnf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalbnf ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalbnf.h" + +LIST_SCALBN_TESTS(float, __llvm_libc::scalbnf) diff --git a/libc/test/src/math/smoke/scalbnl_test.cpp b/libc/test/src/math/smoke/scalbnl_test.cpp new file mode 100644 index 0000000000000..197887b2448c0 --- /dev/null +++ b/libc/test/src/math/smoke/scalbnl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalbnl ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalbnl.h" + +LIST_SCALBN_TESTS(long double, __llvm_libc::scalbnl) diff --git a/libc/test/src/math/smoke/sincosf_test.cpp b/libc/test/src/math/smoke/sincosf_test.cpp new file mode 100644 index 0000000000000..102a50c428d26 --- /dev/null +++ b/libc/test/src/math/smoke/sincosf_test.cpp @@ -0,0 +1,51 @@ +//===-- Unittests for sincosf ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/sincosf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcSinCosfTest, SpecialNumbers) {
+  libc_errno = 0;
+  float sin, cos;
+
+  __llvm_libc::sincosf(aNaN, &sin, &cos);
+  EXPECT_FP_EQ(aNaN, cos);
+  EXPECT_FP_EQ(aNaN, sin);
+  EXPECT_MATH_ERRNO(0);
+
+  __llvm_libc::sincosf(0.0f, &sin, &cos);
+  EXPECT_FP_EQ(1.0f, cos);
+  EXPECT_FP_EQ(0.0f, sin);
+  EXPECT_MATH_ERRNO(0);
+
+  __llvm_libc::sincosf(-0.0f, &sin, &cos);
+  EXPECT_FP_EQ(1.0f, cos);
+  EXPECT_FP_EQ(-0.0f, sin);
+  EXPECT_MATH_ERRNO(0);
+
+  __llvm_libc::sincosf(inf, &sin, &cos);
+  EXPECT_FP_EQ(aNaN, cos);
+  EXPECT_FP_EQ(aNaN, sin);
+  EXPECT_MATH_ERRNO(EDOM);
+
+  __llvm_libc::sincosf(neg_inf, &sin, &cos);
+  EXPECT_FP_EQ(aNaN, cos);
+  EXPECT_FP_EQ(aNaN, sin);
+  EXPECT_MATH_ERRNO(EDOM);
+}
diff --git a/libc/test/src/math/smoke/sinf_test.cpp b/libc/test/src/math/smoke/sinf_test.cpp
new file mode 100644
index 0000000000000..e67062d74190b
--- /dev/null
+++ b/libc/test/src/math/smoke/sinf_test.cpp
@@ -0,0 +1,40 @@
+//===-- Unittests for sinf ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/sinf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcSinfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ(aNaN, __llvm_libc::sinf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(0.0f, __llvm_libc::sinf(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(-0.0f, __llvm_libc::sinf(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(aNaN, __llvm_libc::sinf(inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ(aNaN, __llvm_libc::sinf(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+}
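The errno expectations in the sincosf/sinf tests above follow the C
standard's domain-error rule: sin and cos of +/-infinity is a domain error
(NaN result, errno set to EDOM), while a NaN input merely propagates and
leaves errno alone. A hedged standalone sketch against a hosted libm; the
errno assertion only holds when math_errhandling includes MATH_ERRNO, so this
is illustrative rather than portable:

// Standalone sketch, not part of the patch: sin(inf) is a domain error.
#include <cassert>
#include <cerrno>
#include <cmath>
#include <limits>

int main() {
  errno = 0;
  float s = std::sin(std::numeric_limits<float>::infinity());
  assert(std::isnan(s)); // NaN result for an infinite argument
  // Only guaranteed when (math_errhandling & MATH_ERRNO) != 0:
  assert(errno == EDOM);
  return 0;
}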
diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp
new file mode 100644
index 0000000000000..5926ea647490b
--- /dev/null
+++ b/libc/test/src/math/smoke/sinhf_test.cpp
@@ -0,0 +1,67 @@
+//===-- Unittests for sinhf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/sinhf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcSinhfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ(aNaN, __llvm_libc::sinhf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(0.0f, __llvm_libc::sinhf(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(-0.0f, __llvm_libc::sinhf(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(inf, __llvm_libc::sinhf(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(neg_inf, __llvm_libc::sinhf(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+}
+
+// For small values, sinh(x) is x.
+TEST(LlvmLibcSinhfTest, SmallValues) {
+  float x = float(FPBits(uint32_t(0x17800000)));
+  float result = __llvm_libc::sinhf(x);
+  EXPECT_FP_EQ(x, result);
+
+  x = float(FPBits(uint32_t(0x00400000)));
+  result = __llvm_libc::sinhf(x);
+  EXPECT_FP_EQ(x, result);
+}
+
+TEST(LlvmLibcSinhfTest, Overflow) {
+  libc_errno = 0;
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::sinhf(float(FPBits(0x7f7fffffU))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::sinhf(float(FPBits(0x42cffff8U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(
+      inf, __llvm_libc::sinhf(float(FPBits(0x42d00008U))), FE_OVERFLOW);
+  EXPECT_MATH_ERRNO(ERANGE);
+}
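A quick sanity check on the Overflow inputs above (editorial worked example,
not part of the patch): for large x, sinh(x) ~ e^x / 2, which exceeds
FLT_MAX ~ 2^128 once e^x > 2^129, i.e. x > 129 * ln 2 ~ 89.42. The bit
patterns 0x42cffff8 and 0x42d00008 decode to floats just below and just above
104.0 (sign 0, exponent 133 - 127 = 6, significand 1.625, and 1.625 * 2^6 =
104), so all three inputs lie well past the overflow threshold and
FE_OVERFLOW with errno == ERANGE is the required outcome.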
diff --git a/libc/test/src/math/smoke/sqrt_test.cpp b/libc/test/src/math/smoke/sqrt_test.cpp
new file mode 100644
index 0000000000000..237264895bbd2
--- /dev/null
+++ b/libc/test/src/math/smoke/sqrt_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for sqrt ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/sqrt.h"
+
+LIST_SQRT_TESTS(double, __llvm_libc::sqrt)
diff --git a/libc/test/src/math/smoke/sqrtf_test.cpp b/libc/test/src/math/smoke/sqrtf_test.cpp
new file mode 100644
index 0000000000000..c7681d01569ae
--- /dev/null
+++ b/libc/test/src/math/smoke/sqrtf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for sqrtf------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/sqrtf.h"
+
+LIST_SQRT_TESTS(float, __llvm_libc::sqrtf)
diff --git a/libc/test/src/math/smoke/sqrtl_test.cpp b/libc/test/src/math/smoke/sqrtl_test.cpp
new file mode 100644
index 0000000000000..c48ebb08444d0
--- /dev/null
+++ b/libc/test/src/math/smoke/sqrtl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for sqrtl------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SqrtTest.h"
+
+#include "src/math/sqrtl.h"
+
+LIST_SQRT_TESTS(long double, __llvm_libc::sqrtl)
diff --git a/libc/test/src/math/smoke/tanf_test.cpp b/libc/test/src/math/smoke/tanf_test.cpp
new file mode 100644
index 0000000000000..62cc42c0bef1a
--- /dev/null
+++ b/libc/test/src/math/smoke/tanf_test.cpp
@@ -0,0 +1,40 @@
+//===-- Unittests for tanf ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/tanf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcTanfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ(aNaN, __llvm_libc::tanf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(0.0f, __llvm_libc::tanf(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(-0.0f, __llvm_libc::tanf(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(aNaN, __llvm_libc::tanf(inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ(aNaN, __llvm_libc::tanf(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+}
diff --git a/libc/test/src/math/smoke/tanhf_test.cpp b/libc/test/src/math/smoke/tanhf_test.cpp
new file mode 100644
index 0000000000000..0152de2f5ea04
--- /dev/null
+++ b/libc/test/src/math/smoke/tanhf_test.cpp
@@ -0,0 +1,40 @@
+//===-- Unittests for tanhf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/tanhf.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <math.h>
+
+#include <errno.h>
+#include <stdint.h>
+
+using FPBits = __llvm_libc::fputil::FPBits<float>;
+
+DECLARE_SPECIAL_CONSTANTS(float)
+
+TEST(LlvmLibcTanhfTest, SpecialNumbers) {
+  libc_errno = 0;
+
+  EXPECT_FP_EQ(aNaN, __llvm_libc::tanhf(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(0.0f, __llvm_libc::tanhf(0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(-0.0f, __llvm_libc::tanhf(-0.0f));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(1.0f, __llvm_libc::tanhf(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(-1.0f, __llvm_libc::tanhf(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+}
diff --git a/libc/test/src/math/smoke/trunc_test.cpp b/libc/test/src/math/smoke/trunc_test.cpp
new file mode 100644
index 0000000000000..f0a808dcb2368
--- /dev/null
+++ b/libc/test/src/math/smoke/trunc_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for trunc -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TruncTest.h" + +#include "src/math/trunc.h" + +LIST_TRUNC_TESTS(double, __llvm_libc::trunc) diff --git a/libc/test/src/math/smoke/truncf_test.cpp b/libc/test/src/math/smoke/truncf_test.cpp new file mode 100644 index 0000000000000..cbd672cfe2e21 --- /dev/null +++ b/libc/test/src/math/smoke/truncf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for truncf ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TruncTest.h" + +#include "src/math/truncf.h" + +LIST_TRUNC_TESTS(float, __llvm_libc::truncf) diff --git a/libc/test/src/math/smoke/truncl_test.cpp b/libc/test/src/math/smoke/truncl_test.cpp new file mode 100644 index 0000000000000..0cf9ab8cd5575 --- /dev/null +++ b/libc/test/src/math/smoke/truncl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for truncl ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TruncTest.h" + +#include "src/math/truncl.h" + +LIST_TRUNC_TESTS(long double, __llvm_libc::truncl) diff --git a/libcxx/utils/libcxx/test/dsl.py b/libcxx/utils/libcxx/test/dsl.py index d2e0c6036dd3a..5d4ca83be1308 100644 --- a/libcxx/utils/libcxx/test/dsl.py +++ b/libcxx/utils/libcxx/test/dsl.py @@ -186,7 +186,7 @@ def programOutput(config, program, args=None): "Failed to run program, cmd:\n{}\nstderr is:\n{}".format(runcmd, err) ) - return libcxx.test.format._parseLitOutput(out) + return out @_memoizeExpensiveOperation( diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py index ddd88f25646ea..c7c0bad681ddc 100644 --- a/libcxx/utils/libcxx/test/format.py +++ b/libcxx/utils/libcxx/test/format.py @@ -35,39 +35,6 @@ def _checkBaseSubstitutions(substitutions): for s in ["%{cxx}", "%{compile_flags}", "%{link_flags}", "%{flags}", "%{exec}"]: assert s in substitutions, "Required substitution {} was not provided".format(s) -def _parseLitOutput(fullOutput): - """ - Parse output of a Lit ShTest to extract the actual output of the contained commands. - - This takes output of the form - - $ ":" "RUN: at line 11" - $ "echo" "OUTPUT1" - # command output: - OUTPUT1 - - $ ":" "RUN: at line 12" - $ "echo" "OUTPUT2" - # command output: - OUTPUT2 - - and returns a string containing - - OUTPUT1 - OUTPUT2 - - as-if the commands had been run directly. This is a workaround for the fact - that Lit doesn't let us execute ShTest and retrieve the raw output without - injecting additional Lit output around it. 
- """ - parsed = '' - for output in re.split('[$]\s*":"\s*"RUN: at line \d+"', fullOutput): - if output: # skip blank lines - commandOutput = re.search("# command output:\n(.+)\n$", output, flags=re.DOTALL) - if commandOutput: - parsed += commandOutput.group(1) - return parsed - def _executeScriptInternal(test, litConfig, commands): """ Returns (stdout, stderr, exitCode, timeoutInfo, parsedCommands) @@ -79,21 +46,12 @@ def _executeScriptInternal(test, litConfig, commands): _, tmpBase = _getTempPaths(test) execDir = os.path.dirname(test.getExecPath()) res = lit.TestRunner.executeScriptInternal( - test, litConfig, tmpBase, parsedCommands, execDir + test, litConfig, tmpBase, parsedCommands, execDir, debug=False ) if isinstance(res, lit.Test.Result): # Handle failure to parse the Lit test res = ("", res.output, 127, None) (out, err, exitCode, timeoutInfo) = res - # TODO: As a temporary workaround until https://reviews.llvm.org/D81892 lands, manually - # split any stderr output that is included in stdout. It shouldn't be there, but - # the Lit internal shell conflates stderr and stdout. - conflatedErrorOutput = re.search("(# command stderr:.+$)", out, flags=re.DOTALL) - if conflatedErrorOutput: - conflatedErrorOutput = conflatedErrorOutput.group(0) - out = out[: -len(conflatedErrorOutput)] - err += conflatedErrorOutput - return (out, err, exitCode, timeoutInfo, parsedCommands) @@ -400,9 +358,8 @@ def _generateGenTest(self, testSuite, pathInSuite, litConfig, localConfig): raise RuntimeError(f"Error while trying to generate gen test\nstdout:\n{out}\n\nstderr:\n{err}") # Split the generated output into multiple files and generate one test for each file - parsed = _parseLitOutput(out) - for (subfile, content) in self._splitFile(parsed): - generatedFile = testSuite.getExecPath(pathInSuite + (subfile, )) + for subfile, content in self._splitFile(out): + generatedFile = testSuite.getExecPath(pathInSuite + (subfile,)) os.makedirs(os.path.dirname(generatedFile), exist_ok=True) with open(generatedFile, 'w') as f: f.write(content) diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S index 58ffd1b9e1fb3..5534d1734b6ba 100644 --- a/libunwind/src/UnwindRegistersSave.S +++ b/libunwind/src/UnwindRegistersSave.S @@ -305,9 +305,21 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) mflr 0 std 0, PPC64_OFFS_SRR0(3) // store lr as ssr0 PPC64_STR(1) + PPC64_STR(4) // Save r4 first since it will be used for fixing r2. +#if defined(_AIX) + // The TOC register (r2) was changed by the glue code if unw_getcontext + // is called from a different module. Save the original TOC register + // in the context if this is the case. + mflr 4 + lwz 4, 0(4) // Get the first instruction at the return address. + xoris 0, 4, 0xe841 // Is it reloading the TOC register "ld 2,40(1)"? + cmplwi 0, 0x28 + bne 0, LnoR2Fix // No need to fix up r2 if it is not. + ld 2, 40(1) // Use the saved TOC register in the stack. +LnoR2Fix: +#endif PPC64_STR(2) PPC64_STR(3) - PPC64_STR(4) PPC64_STR(5) PPC64_STR(6) PPC64_STR(7) @@ -547,9 +559,21 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) mflr 0 stw 0, 0(3) // store lr as ssr0 stw 1, 12(3) + stw 4, 24(3) // Save r4 first since it will be used for fixing r2. +#if defined(_AIX) + // The TOC register (r2) was changed by the glue code if unw_getcontext + // is called from a different module. Save the original TOC register + // in the context if this is the case. + mflr 4 + lwz 4, 0(4) // Get the instruction at the return address. 
+  xoris 0, 4, 0x8041 // Is it reloading the TOC register "lwz 2,20(1)"?
+  cmplwi 0, 0x14
+  bne 0, LnoR2Fix    // No need to fix up r2 if it is not.
+  lwz 2, 20(1)       // Use the saved TOC register in the stack.
+LnoR2Fix:
+#endif
   stw 2, 16(3)
   stw 3, 20(3)
-  stw 4, 24(3)
   stw 5, 28(3)
   stw 6, 32(3)
   stw 7, 36(3)
diff --git a/libunwind/test/unw_resume.pass.cpp b/libunwind/test/unw_resume.pass.cpp
index 76273e4a8ef0a..08e8d4edeaf29 100644
--- a/libunwind/test/unw_resume.pass.cpp
+++ b/libunwind/test/unw_resume.pass.cpp
@@ -10,9 +10,6 @@
 // Ensure that unw_resume() resumes execution at the stack frame identified by
 // cursor.
 
-// TODO: Investigate this failure on AIX system.
-// XFAIL: target={{.*}}-aix{{.*}}
-
 // TODO: Figure out why this fails with Memory Sanitizer.
 // XFAIL: msan
 
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 5d58fc13256f9..12f7bf1dd4449 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -247,6 +247,7 @@ struct Config {
   bool ltoDebugPassManager;
   bool ltoEmitAsm;
   bool ltoUniqueBasicBlockSectionNames;
+  bool ltoValidateAllVtablesHaveTypeInfos;
   bool ltoWholeProgramVisibility;
   bool mergeArmExidx;
   bool mipsN32Abi = false;
@@ -475,6 +476,9 @@ struct Ctx {
   std::atomic<bool> hasTlsIe{false};
   // True if we need to reserve two .got entries for local-dynamic TLS model.
   std::atomic<bool> needsTlsLd{false};
+  // True if all native vtable symbols have corresponding type info symbols
+  // during LTO.
+  bool ltoAllVtablesHaveTypeInfos;
 
   // Each symbol assignment and DEFINED(sym) reference is assigned an increasing
   // order. Each DEFINED(sym) evaluation checks whether the reference happens
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index bf3516192df39..8d16ee3bdbde9 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -110,6 +110,7 @@ void Ctx::reset() {
   needsTlsLd.store(false, std::memory_order_relaxed);
   scriptSymOrderCounter = 1;
   scriptSymOrder.clear();
+  ltoAllVtablesHaveTypeInfos = false;
 }
 
 llvm::raw_fd_ostream Ctx::openAuxiliaryFile(llvm::StringRef filename,
@@ -1036,6 +1037,63 @@ template <class ELFT> static void readCallGraphsFromObjectFiles() {
   }
 }
 
+template <class ELFT>
+static void ltoValidateAllVtablesHaveTypeInfos(opt::InputArgList &args) {
+  DenseSet<StringRef> typeInfoSymbols;
+  SmallSetVector<StringRef, 0> vtableSymbols;
+  auto processVtableAndTypeInfoSymbols = [&](StringRef name) {
+    if (name.consume_front("_ZTI"))
+      typeInfoSymbols.insert(name);
+    else if (name.consume_front("_ZTV"))
+      vtableSymbols.insert(name);
+  };
+
+  // Examine all native symbol tables.
+  for (ELFFileBase *f : ctx.objectFiles) {
+    using Elf_Sym = typename ELFT::Sym;
+    for (const Elf_Sym &s : f->template getGlobalELFSyms<ELFT>()) {
+      if (s.st_shndx != SHN_UNDEF) {
+        StringRef name = check(s.getName(f->getStringTable()));
+        processVtableAndTypeInfoSymbols(name);
+      }
+    }
+  }
+
+  for (SharedFile *f : ctx.sharedFiles) {
+    using Elf_Sym = typename ELFT::Sym;
+    for (const Elf_Sym &s : f->template getELFSyms<ELFT>()) {
+      if (s.st_shndx != SHN_UNDEF) {
+        StringRef name = check(s.getName(f->getStringTable()));
+        processVtableAndTypeInfoSymbols(name);
+      }
+    }
+  }
+
+  SmallSetVector<StringRef, 0> vtableSymbolsWithNoRTTI;
+  for (StringRef s : vtableSymbols)
+    if (!typeInfoSymbols.count(s))
+      vtableSymbolsWithNoRTTI.insert(s);
+
+  // Remove known safe symbols.
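+  // For example, --lto-known-safe-vtables=_ZTV6Native strips the mandatory
+  // _ZTV prefix and removes "6Native" from vtableSymbolsWithNoRTTI below, so
+  // that vtable alone no longer disables whole program visibility.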
+  for (auto *arg : args.filtered(OPT_lto_known_safe_vtables)) {
+    StringRef knownSafeName = arg->getValue();
+    if (!knownSafeName.consume_front("_ZTV"))
+      error("--lto-known-safe-vtables=: expected symbol to start with _ZTV, "
+            "but got " +
+            knownSafeName);
+    vtableSymbolsWithNoRTTI.remove(knownSafeName);
+  }
+
+  ctx.ltoAllVtablesHaveTypeInfos = vtableSymbolsWithNoRTTI.empty();
+  // Check for unmatched RTTI symbols
+  for (StringRef s : vtableSymbolsWithNoRTTI) {
+    message(
+        "--lto-validate-all-vtables-have-type-infos: RTTI missing for vtable "
+        "_ZTV" +
+        s + ", --lto-whole-program-visibility disabled");
+  }
+}
+
 static DebugCompressionType getCompressionType(StringRef s, StringRef option) {
   DebugCompressionType type = StringSwitch<DebugCompressionType>(s)
                                   .Case("zlib", DebugCompressionType::Zlib)
@@ -1236,6 +1294,9 @@ static void readConfigs(opt::InputArgList &args) {
   config->ltoWholeProgramVisibility =
       args.hasFlag(OPT_lto_whole_program_visibility,
                    OPT_no_lto_whole_program_visibility, false);
+  config->ltoValidateAllVtablesHaveTypeInfos =
+      args.hasFlag(OPT_lto_validate_all_vtables_have_type_infos,
+                   OPT_no_lto_validate_all_vtables_have_type_infos, false);
   config->ltoo = args::getInteger(args, OPT_lto_O, 2);
   if (config->ltoo > 3)
     error("invalid optimization level for LTO: " + Twine(config->ltoo));
@@ -2815,6 +2876,10 @@ void LinkerDriver::link(opt::InputArgList &args) {
       config->ltoEmitAsm ||
       !config->thinLTOModulesToCompile.empty();
 
+  // Handle --lto-validate-all-vtables-have-type-infos.
+  if (config->ltoValidateAllVtablesHaveTypeInfos)
+    invokeELFT(ltoValidateAllVtablesHaveTypeInfos, args);
+
   // Do link-time optimization if given files are LLVM bitcode files.
   // This compiles bitcode files into real object files.
   //
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index 1e68d20022c05..eb8f8d7f829a6 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -152,6 +152,9 @@ static lto::Config createConfig() {
   c.DwoDir = std::string(config->dwoDir);
 
   c.HasWholeProgramVisibility = config->ltoWholeProgramVisibility;
+  c.ValidateAllVtablesHaveTypeInfos =
+      config->ltoValidateAllVtablesHaveTypeInfos;
+  c.AllVtablesHaveTypeInfos = ctx.ltoAllVtablesHaveTypeInfos;
   c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty();
 
   for (const llvm::StringRef &name : config->thinLTOModulesToCompile)
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index a3adf49e3d66e..ab61b1627e544 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -604,9 +604,14 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">,
 defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch",
   "turn on warnings about profile cfg mismatch (default)",
   "turn off warnings about profile cfg mismatch">;
+defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables",
+  "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">;
 def lto_obj_path_eq: JJ<"lto-obj-path=">;
 def lto_sample_profile: JJ<"lto-sample-profile=">,
   HelpText<"Sample profile file path">;
+defm lto_validate_all_vtables_have_type_infos: BB<"lto-validate-all-vtables-have-type-infos",
+  "Validate that all vtables have type infos for LTO link",
+  "Do not validate that all vtables have type infos for LTO link">;
 defm lto_whole_program_visibility: BB<"lto-whole-program-visibility",
   "Asserts that the LTO link has whole program visibility",
   "Asserts that the LTO link does not have whole program visibility">;
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll
new file mode
100644 index 0000000000000..fb357831d6f21 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,26 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI6Native, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } +@_ZTS6Native = linkonce_odr constant [8 x i8] c"6Native\00" +@_ZTI6Native = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS6Native, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 0000000000000..4533504c60180 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,19 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 0000000000000..43df8366aa2ae --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,68 @@ +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > b.cc <<'eof' +;; #include "a.h" +;; struct B : A { int foo() { return 2; } }; +;; int baz() { B b; return bar(&b); } +;; eof +;; clang++ -flto=thin b.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.B = type { %struct.A } +%struct.A = type { ptr } + +@_ZTV1B = linkonce_odr dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B3fooEv] }, !type !0, !type !1, !type !2, !type !3 +@_ZTS1B = linkonce_odr dso_local constant [3 x i8] c"1B\00" +@_ZTI1A = external constant ptr +@_ZTI1B = linkonce_odr dso_local constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } +@_ZTV1A = external unnamed_addr constant { [3 x ptr] } + +define dso_local noundef i32 @_Z3bazv() #0 { +entry: + %b = alloca %struct.B + call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b) + %call = call noundef i32 @_Z3barP1A(ptr noundef %b) + ret i32 %call +} + +define linkonce_odr dso_local void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, 
ptr %this.addr + %this1 = load ptr, ptr %this.addr + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1) + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +declare i32 @_Z3barP1A(ptr noundef) + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +define linkonce_odr i32 @_ZN1B3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 2 +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} +!2 = !{i64 16, !"_ZTS1B"} +!3 = !{i64 16, !"_ZTSM1BFivE.virtual"} diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll new file mode 100644 index 0000000000000..6cc55df82e2f2 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll @@ -0,0 +1,16 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@_ZTV1B = external unnamed_addr constant { [4 x ptr] } + +define linkonce_odr void @_ZN1BC2Ev(ptr %this) #0 { + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll new file mode 100644 index 0000000000000..74e437747df3b --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,263 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cat %s > %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!12, !13}' >> %t1_regular.ll +; RUN: echo '!12 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!13 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o +; RUN: ld.lld %t2.o -o %t2.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll -o %t2_nortti.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_nortti.bc -o %t2_nortti.o +; RUN: ld.lld %t2_nortti.o -o %t2_nortti.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_undef.ll -o %t2_undef.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_undef.bc -o %t2_undef.o +; RUN: ld.lld %t2_undef.o -o %t2_undef.so -shared + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility \ +; 
RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-validate-all-vtables-have-type-infos, the linker checks for the presence of vtables +;; and RTTI in native files and blocks devirtualization to be conservative on correctness +;; for these types. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t4_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t4_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t4_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t4_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.so -o %t5_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.so -o %t5_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.so -o %t5_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t5_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-NOT: single-impl: +; VALIDATE: single-impl: devirtualized a call to _ZN1D1mEi +; VALIDATE-NOT: single-impl: + +;; When vtables without type infos are detected in native files, we have a hole in our knowledge so +;; --lto-validate-all-vtables-have-type-infos conservatively disables --lto-whole-program-visibility + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t6_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t6_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t6_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t6_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.so -o %t7_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.so -o %t7_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.so -o %t7_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t7_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +; NO-RTTI-DAG: --lto-validate-all-vtables-have-type-infos: RTTI missing for vtable _ZTV6Native, --lto-whole-program-visibility disabled +; NO-RTTI-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; --lto-known-safe-vtables=* can be used to specifically allow types to participate in WPD +;; even if they don't have corresponding RTTI + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t8_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t8_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t8_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t8_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Only check for definitions of vtables symbols, just having a reference does not allow a type to +;; be derived from + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_undef.o -o %t9_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_undef.o -o %t9_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_undef.o -o %t9_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t9_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. 
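+  ;; (CHECK-IR lines match the plain --lto-whole-program-visibility links,
+  ;; CHECK-VALIDATE-IR the --lto-validate-all-vtables-have-type-infos links
+  ;; against native objects/DSOs that do carry RTTI, and CHECK-NO-RTTI-IR the
+  ;; links against the no-RTTI variants.)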
+ ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; --lto-whole-program-visibility disabled so no devirtualization + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + ; CHECK-NO-RTTI-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call2 = tail call i32 %fptr22 + ; CHECK-NO-RTTI-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; Types not present in native files can still be devirtualized + ; CHECK-VALIDATE-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; --lto-whole-program-visibility disabled but being local this + ;; has VCallVisibilityTranslationUnit visibility so it's still devirtualized + ; CHECK-NO-RTTI-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll new file mode 100644 index 0000000000000..15040b8707aed --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll @@ -0,0 +1,183 @@ +; REQUIRES: x86 + +; RUN: rm -rf %t.dir +; RUN: split-file %s %t.dir +; RUN: cd %t.dir + +;; Common artifacts +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1.o ThinLTO.ll +; RUN: opt -module-summary -o %t2.o RegularLTO.ll + +;; --lto-whole-program-visibility when there's split ThinLTO and a RegularLTO with summary optimizes +;; using the combined index. +; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +;; --lto-validate-all-vtables-have-type-infos when there's split ThinLTO and a RegularLTO with summary behaves the same +;; as everything is present in the combined index. +; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi + +;--- ThinLTO.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ], section "llvm.metadata" + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { + ;; Call function built with RegularLTO + %RegularLTOResult = call i32 @RegularLTO(ptr %obj, i32 %a) + + ;; ThinLTO code starts here + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i32 @RegularLTO(ptr) +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1A1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. 
+attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} + +;--- RegularLTO.ll +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV7Regular = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI7Regular, ptr @_ZN7Regular1fEi, ptr @_ZN1A1nEi] } , !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTS7Regular = linkonce_odr constant [9 x i8] c"7Regular\00" +@_ZTI7Regular = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS7Regular, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [1 x ptr] [ ptr @_ZTV7Regular ], section "llvm.metadata" + +; CHECK-COMMON-REGULAR-IR-LABEL: define dso_local i32 @RegularLTO +define i32 @RegularLTO(ptr %obj, i32 %a) #0 { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptr1 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. + ; CHECK-REGULAR-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + ret i32 %call +} +; CHECK-COMMON-REGULAR-IR-LABEL: ret i32 +; CHECK-COMMON-REGULAR-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN7Regular1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } +!llvm.module.flags = !{!6, !7} + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS7Regular"} +!4 = !{i64 16, !"_ZTSM7RegularFviE.virtual"} +!5 = !{i64 24, !"_ZTSM7RegularFviE.virtual"} +!6 = !{i32 1, !"ThinLTO", i32 0} +!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 0000000000000..3f3ea2cc5a375 --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,136 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cat %s > %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!6, !7}' >> %t1_regular.ll +; RUN: echo '!6 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index 
-save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-whole-program-visibility and --lto-validate-all-vtables-have-type-infos +;; we rely on resolutions on the typename symbol to inform us of what's outside the summary. +;; Without the typename symbol in the LTO unit (e.g. RTTI disabled) this causes +;; conservative disablement of WPD on these types unless it's local + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !2 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN1D1mEi] }, !type !3, !vcall_visibility !5 + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. 
+ ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; No resolution for _ZTS1A means we don't devirtualize + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call3 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call3 = tail call i32 %fptr22 + %call3 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !4) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call4 = tail call i32 @_ZN1D1mEi + ;; Being local this has VCallVisibilityTranslationUnit + ;; visibility so it's still devirtualized + ; CHECK-VALIDATE-IR: %call4 = tail call i32 @_ZN1D1mEi + %call4 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call3) + ret i32 %call4 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTS1B"} +!2 = !{i64 16, !"_ZTS1C"} +!3 = !{i64 16, !4} +!4 = distinct !{} +!5 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 0000000000000..2318af4b16ab4 --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,130 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cat %s > %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!2, !3}' >> %t1_regular.ll +; RUN: echo '!2 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!3 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_ref.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o + +;; Native objects can contain only a reference to the base type infos if the base declaration has no key functions. +;; Because of that, --lto-validate-all-vtables-have-type-infos needs to query for the type info symbol inside native files rather than the +;; type name symbol that's used as the key in !type metadata to correctly stop devirtualization on the native type. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +; CHECK-NOT: single-impl: devirtualized a call to _ZN1A3fooEv + +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > main.cc <<'eof' +;; #include "a.h" +;; +;; int A::foo() { return 1; } +;; int bar(A *a) { return a->foo(); } +;; +;; extern int baz(); +;; int main() { +;; A a; +;; int i = bar(&a); +;; int j = baz(); +;; return i + j; +;; } +;; eof +;; clang++ -fwhole-program-vtables -fno-split-lto-unit -flto=thin main.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { %struct.Abase } +%struct.Abase = type { ptr } + +@_ZTV1A = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A3fooEv] }, align 8, !type !0, !type !1 +@_ZTS1A = dso_local constant [3 x i8] c"1A\00", align 1 +@_ZTI1A = dso_local constant { ptr, ptr } { ptr null, ptr @_ZTS1A }, align 8 + +define dso_local noundef i32 @_ZN1A3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 1 +} + +; CHECK-IR: define dso_local noundef i32 @_Z3barP1A +define dso_local noundef i32 @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr + store ptr %a, ptr %a.addr + %0 = load ptr, ptr %a.addr + %vtable = load ptr, ptr %0 + %1 = call i1 @llvm.public.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %1) + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %fptr = load ptr, ptr %vfn + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call = call noundef i32 %fptr + %call = call noundef i32 %fptr(ptr noundef nonnull align 8 dereferenceable(8) %0) + ret i32 %call +} +; CHECK-IR: ret i32 +; CHECK-IR: } + +declare i1 @llvm.public.type.test(ptr, metadata) +declare void @llvm.assume(i1 noundef) + +define dso_local noundef i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %a = alloca %struct.A, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) + %call = call noundef i32 @_Z3barP1A(ptr noundef %a) + store i32 %call, ptr %i, align 4 + %call1 = call noundef i32 @_Z3bazv() + store i32 %call1, ptr %j, align 4 + %0 = load i32, ptr %i, align 4 + %1 = load i32, ptr %j, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +declare noundef i32 @_Z3bazv() + +;; Make sure we don't inline or otherwise optimize out the direct calls. 
+attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} diff --git a/lld/test/ELF/pr37735.s b/lld/test/ELF/pr37735.s deleted file mode 100644 index a9d188a58d6ed..0000000000000 --- a/lld/test/ELF/pr37735.s +++ /dev/null @@ -1,12 +0,0 @@ -# REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=i386-pc-linux-gnu %s -o %t.o -# RUN: ld.lld -r %t.o %t.o -o %t1.o -# RUN: llvm-objdump -s --section=.bar %t1.o | FileCheck %s - -.section .foo - .byte 0 - -# CHECK: Contents of section .bar: -# CHECK-NEXT: 0000 00000000 01000000 -.section .bar - .dc.a .foo diff --git a/lld/test/ELF/relocatable-section-symbol.s b/lld/test/ELF/relocatable-section-symbol.s index 6657da2032fc0..9d8892236304b 100644 --- a/lld/test/ELF/relocatable-section-symbol.s +++ b/lld/test/ELF/relocatable-section-symbol.s @@ -1,50 +1,77 @@ -# REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# REQUIRES: x86, zlib +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o # RUN: ld.lld -r -o %t %t.o %t.o -# RUN: llvm-readobj -r %t | FileCheck --check-prefix=RELA %s +# RUN: llvm-readelf -r -x .data -x .bar -x .debug_line %t | FileCheck --check-prefix=RELA %s -# RELA: Relocations [ -# RELA-NEXT: Section ({{.*}}) .rela.data { -# RELA-NEXT: 0x0 R_X86_64_32 .text 0x1 -# RELA-NEXT: 0x4 R_X86_64_32 .text 0x5 -# RELA-NEXT: } -# RELA-NEXT: ] +# RELA: Offset Info Type Symbol's Value Symbol's Name + Addend +# RELA-NEXT: 0000000000000000 {{.*}} R_X86_64_32 0000000000000000 .text + 1 +# RELA-NEXT: 0000000000000004 {{.*}} R_X86_64_32 0000000000000000 .text + 5 +# RELA-EMPTY: +# RELA: Offset Info Type Symbol's Value Symbol's Name + Addend +# RELA-NEXT: 0000000000000000 {{.*}} R_X86_64_64 0000000000000000 .foo + 1 +# RELA-NEXT: 0000000000000008 {{.*}} R_X86_64_32 0000000000000000 .text + 0 +# RELA-NEXT: 000000000000000c {{.*}} R_X86_64_64 0000000000000000 .foo + 2 +# RELA-NEXT: 0000000000000014 {{.*}} R_X86_64_32 0000000000000000 .text + 4 +# RELA-EMPTY: +# RELA: Offset Info Type Symbol's Value Symbol's Name + Addend +# RELA-NEXT: 0000000000000000 {{.*}} R_X86_64_64 0000000000000000 .foo + 1 +# RELA-NEXT: 0000000000000008 {{.*}} R_X86_64_32 0000000000000000 .text + 0 +# RELA-NEXT: 000000000000000c {{.*}} R_X86_64_64 0000000000000000 .foo + 2 +# RELA-NEXT: 0000000000000014 {{.*}} R_X86_64_32 0000000000000000 .text + 4 +# RELA: Hex dump of section '.data': +# RELA-NEXT: 0x00000000 00000000 00000000 ........ +# RELA: Hex dump of section '.bar': +# RELA-NEXT: 0x00000000 00000000 00000000 00000000 00000000 ................ +# RELA-NEXT: 0x00000010 00000000 00000000 ........ +# RELA: Hex dump of section '.debug_line': +# RELA-NEXT: 0x00000000 00000000 00000000 00000000 00000000 ................ +# RELA-NEXT: 0x00000010 00000000 00000000 ........ 
-# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux %s -o %t.o
-# RUN: ld.lld -r -o %t %t.o %t.o
-# RUN: llvm-readobj -r -S --section-data %t | FileCheck --check-prefix=REL %s
-
-
-# REL: Section {
-# REL: Index:
-# REL: Name: .data
-# REL-NEXT: Type: SHT_PROGBITS
-# REL-NEXT: Flags [
-# REL-NEXT: SHF_ALLOC
-# REL-NEXT: SHF_WRITE
-# REL-NEXT: ]
-# REL-NEXT: Address:
-# REL-NEXT: Offset:
-# REL-NEXT: Size:
-# REL-NEXT: Link:
-# REL-NEXT: Info:
-# REL-NEXT: AddressAlignment:
-# REL-NEXT: EntrySize:
-# REL-NEXT: SectionData (
-# REL-NEXT: 0000: 01000000 05000000 |
-# REL-NEXT: )
-# REL-NEXT: }
-
-
-# REL: Relocations [
-# REL-NEXT: Section ({{.*}}) .rel.data {
-# REL-NEXT: 0x0 R_386_32 .text
-# REL-NEXT: 0x4 R_386_32 .text
-# REL-NEXT: }
-# REL-NEXT: ]
+# RUN: llvm-mc -filetype=obj -triple=i686 %s -o %t1.o
+# RUN: ld.lld -r -o %t1 %t1.o %t1.o
+# RUN: llvm-readelf -r -x .data -x .bar -x .debug_line %t1 | FileCheck %s --check-prefixes=REL,REL0
+# RUN: ld.lld -r --compress-debug-sections=zlib -o %t1.zlib %t1.o %t1.o
+# RUN: llvm-objcopy --decompress-debug-sections %t1.zlib %t1.zlib.de
+# RUN: llvm-readelf -r -x .data -x .bar -x .debug_line %t1.zlib.de | FileCheck %s --check-prefixes=REL,REL1
+
+# REL: Offset Info Type Sym. Value Symbol's Name
+# REL-NEXT: 00000000 {{.*}} R_386_32 00000000 .text
+# REL-NEXT: 00000004 {{.*}} R_386_32 00000000 .text
+# REL-EMPTY:
+# REL: Offset Info Type Sym. Value Symbol's Name
+# REL-NEXT: 00000000 {{.*}} R_386_32 00000000 .foo
+# REL-NEXT: 00000004 {{.*}} R_386_32 00000000 .text
+# REL-NEXT: 00000008 {{.*}} R_386_32 00000000 .foo
+# REL-NEXT: 0000000c {{.*}} R_386_32 00000000 .text
+# REL-EMPTY:
+# REL: Offset Info Type Sym. Value Symbol's Name
+# REL-NEXT: 00000000 {{.*}} R_386_32 00000000 .foo
+# REL-NEXT: 00000004 {{.*}} R_386_32 00000000 .text
+# REL-NEXT: 00000008 {{.*}} R_386_32 00000000 .foo
+# REL-NEXT: 0000000c {{.*}} R_386_32 00000000 .text
+# REL: Hex dump of section '.data':
+# REL-NEXT: 0x00000000 01000000 05000000 ........
+# REL: Hex dump of section '.bar':
+# REL-NEXT: 0x00000000 01000000 00000000 02000000 04000000 ................
+# REL0: Hex dump of section '.debug_line':
+# REL0-NEXT: 0x00000000 01000000 00000000 02000000 04000000 ................
+## FIXME: https://github.com/llvm/llvm-project/issues/66738 The implicit addends for the second input section are wrong.
+# REL1: Hex dump of section '.debug_line':
+# REL1-NEXT: 0x00000000 01000000 00000000 01000000 00000000 ................
 
 .long 42
 
 .data
 .long .text + 1
+
+.section .foo
+.byte 0
+
+.section .bar
+.dc.a .foo + 1
+.dc.l .text
+
+.section .debug_line
+.dc.a .foo + 1
+.dc.l .text
diff --git a/lldb/docs/resources/contributing.rst b/lldb/docs/resources/contributing.rst
index 54917f1ce8175..5ac4afb82a074 100644
--- a/lldb/docs/resources/contributing.rst
+++ b/lldb/docs/resources/contributing.rst
@@ -18,19 +18,45 @@ Please refer to the `LLVM Developer Policy
 <https://llvm.org/docs/DeveloperPolicy.html>`_ when
 authoring and uploading a patch. LLDB differs from the LLVM Developer
 Policy in the following respects.
 
-  - **Test infrastructure**: Like LLVM it is important to submit tests with your
-    patches, but note that LLDB uses a different system for tests. Refer to the
-    `test documentation <test.html>`_ for more details and the ``lldb/test``
-    folder on disk for examples.
-
-  - **Coding Style**: LLDB's code style differs from
-    `LLVM's coding style <https://llvm.org/docs/CodingStandards.html>`_.
-    Unfortunately there is no document describing the differences. Please be
-    consistent with the existing code.
-
 For anything not explicitly listed here, assume that LLDB follows the LLVM
 policy.
+Coding Style
++++++++++++++
+
+LLDB's code style differs from `LLVM's coding style `_
+in a few ways. The two main ones are:
+
+* `Variable and function naming `_:
+
+  * Variables are ``snake_case``.
+
+  * Functions and methods are ``UpperCamelCase``.
+
+  * Static, global and member variables have ``s_``, ``g_`` and ``m_``
+    prefixes respectively.
+
+* `Use of asserts `_:
+  See the :ref:`section below`.
+
+For any other contradictions, consider the
+`golden rule `_
+before choosing to update the style of existing code.
+
+All new code in LLDB should be formatted with clang-format. Existing code may
+not conform and should be updated prior to being modified. Bulk reformatting
+is discouraged.
+
+Test Infrastructure
++++++++++++++++++++
+
+As with LLVM, it is important to submit tests with your patches, but note
+that a subset of LLDB tests (the API tests) use a different system. Refer to
+the `test documentation `_ for more details and the
+`lldb/test `_ folder
+for examples.
+
+.. _Error handling:
+
 Error handling and use of assertions in LLDB
 --------------------------------------------
 
@@ -42,14 +68,13 @@ be extra thoughtful about how to handle errors. Below are a couple
 rules of thumb:
 
 * Invalid input.  To deal with invalid input, such as malformed DWARF,
-  missing object files, or otherwise inconsistent debug info, LLVM's
+  missing object files, or otherwise inconsistent debug info,
   error handling types such as `llvm::Expected
   `_ or
-  `std::optional
-  `_ should be
-  used. Functions that may fail should return their result using these
-  wrapper types instead of using a bool to indicate success. Returning
-  a default value when an error occurred is also discouraged.
+  ``std::optional`` should be used (see the sketch after this
+  list). Functions that may fail should return their result using these
+  wrapper types instead of using a bool to indicate success. Returning a
+  default value when an error occurred is also discouraged.
 
 * Assertions.  Assertions (from ``assert.h``) should be used liberally
   to assert internal consistency.  Assertions shall **never** be
@@ -71,16 +96,18 @@ rules of thumb:
   behaves like ``assert()``. When asserts are disabled, it will print
   a warning and encourage the user to file a bug report, similar to
   LLVM's crash handler, and then return execution. Use these sparingly
-  and only if error handling is not otherwise feasible. Specifically,
-  new code should not be using ``lldbassert()`` and existing
-  uses should be replaced by other means of error handling.
+  and only if error handling is not otherwise feasible.
+
+.. note::
+
+  New code should not use ``lldbassert()``, and existing uses should
+  be replaced by other means of error handling.
 
 * Fatal errors.  Aborting LLDB's process using
   ``llvm::report_fatal_error()`` or ``abort()`` should be avoided at all
-  costs.  It's acceptable to use `llvm_unreachable()
-  `_ for
-  actually unreachable code such as the default in an otherwise
-  exhaustive switch statement.
+  costs.  It's acceptable to use ``llvm_unreachable()`` for actually
+  unreachable code such as the default in an otherwise exhaustive
+  switch statement.
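+As an illustration of the invalid-input guidance, a minimal sketch (the
+function name and error message here are hypothetical, not existing LLDB
+API):
+
+.. code-block:: c++
+
+   llvm::Expected<uint64_t> ParseVersion(llvm::StringRef str) {
+     uint64_t version;
+     if (str.getAsInteger(10, version))
+       return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                      "invalid version string");
+     // On success the value travels inside the Expected; no bool flag or
+     // default value is needed.
+     return version;
+   }
+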
 Overall, please keep in mind that the debugger is often used as a last
 resort, and a crash in the debugger is rarely appreciated by the
diff --git a/lldb/include/lldb/Target/DynamicRegisterInfo.h b/lldb/include/lldb/Target/DynamicRegisterInfo.h
index 22ad6335fe438..fb22885e713d6 100644
--- a/lldb/include/lldb/Target/DynamicRegisterInfo.h
+++ b/lldb/include/lldb/Target/DynamicRegisterInfo.h
@@ -93,6 +93,8 @@ class DynamicRegisterInfo {
     return llvm::iterator_range(m_regs);
   }
 
+  void ConfigureOffsets();
+
 protected:
   // Classes that inherit from DynamicRegisterInfo can see and modify these
   typedef std::vector set_collection;
@@ -116,8 +118,6 @@ class DynamicRegisterInfo {
 
   void Finalize(const lldb_private::ArchSpec &arch);
 
-  void ConfigureOffsets();
-
   reg_collection m_regs;
   set_collection m_sets;
   set_reg_num_collection m_set_reg_nums;
diff --git a/lldb/include/lldb/Utility/RegisterValue.h b/lldb/include/lldb/Utility/RegisterValue.h
index a86767dfaf512..49aaf68be17fc 100644
--- a/lldb/include/lldb/Utility/RegisterValue.h
+++ b/lldb/include/lldb/Utility/RegisterValue.h
@@ -33,7 +33,9 @@ class RegisterValue {
     // byte AArch64 SVE.
     kTypicalRegisterByteSize = 256u,
     // Anything else we'll heap allocate storage for it.
-    kMaxRegisterByteSize = kTypicalRegisterByteSize,
+    // 256x256 to support the 256 byte AArch64 SME array storage (ZA)
+    // register, which is a square of vector length x vector length.
+    kMaxRegisterByteSize = 256u * 256u,
   };
 
   typedef llvm::SmallVector BytesContainer;
diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp
index df83a833dfaf9..7d246eeb5fc9b 100644
--- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp
+++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp
@@ -41,6 +41,10 @@
   0x40b /* ARM Scalable Matrix Extension, Streaming SVE mode */
 #endif
 
+#ifndef NT_ARM_ZA
+#define NT_ARM_ZA 0x40c /* ARM Scalable Matrix Extension, Array Storage */
+#endif
+
 #ifndef NT_ARM_PAC_MASK
 #define NT_ARM_PAC_MASK 0x406 /* Pointer authentication code masks */
 #endif
@@ -90,6 +94,16 @@ NativeRegisterContextLinux::CreateHostNativeRegisterContextLinux(
     opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskSSVE);
   }
 
+  sve::user_za_header za_header;
+  ioVec.iov_base = &za_header;
+  ioVec.iov_len = sizeof(za_header);
+  regset = NT_ARM_ZA;
+  if (NativeProcessLinux::PtraceWrapper(PTRACE_GETREGSET,
+                                        native_thread.GetID(), &regset,
+                                        &ioVec, sizeof(za_header))
+          .Success())
+    opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskZA);
+
   NativeProcessLinux &process = native_thread.GetProcess();
 
   std::optional auxv_at_hwcap =
@@ -133,6 +147,7 @@ NativeRegisterContextLinux_arm64::NativeRegisterContextLinux_arm64(
   ::memset(&m_sve_header, 0, sizeof(m_sve_header));
   ::memset(&m_pac_mask, 0, sizeof(m_pac_mask));
   ::memset(&m_tls_regs, 0, sizeof(m_tls_regs));
+  ::memset(&m_sme_pseudo_regs, 0, sizeof(m_sme_pseudo_regs));
 
   m_mte_ctrl_reg = 0;
 
@@ -314,6 +329,41 @@ NativeRegisterContextLinux_arm64::ReadRegister(const RegisterInfo *reg_info,
     offset = reg_info->byte_offset - GetRegisterInfo().GetMTEOffset();
     assert(offset < GetMTEControlSize());
     src = (uint8_t *)GetMTEControl() + offset;
+  } else if (IsSME(reg)) {
+    if (GetRegisterInfo().IsSMERegZA(reg)) {
+      error = ReadZAHeader();
+      if (error.Fail())
+        return error;
+
+      // If there is only a header and no registers, ZA is inactive. Read as 0
+      // in this case.
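+      // (Illustrative numbers: with a 32 byte streaming vector length, the
+      // fake payload built below would be 32 * 32 = 1024 bytes of zeroes
+      // plus the header size.)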
+      if (m_za_header.size == sizeof(m_za_header)) {
+        // This will get reconfigured/reset later, so we are safe to use it.
+        // ZA is a square of VL * VL and the ptrace buffer also includes the
+        // header itself.
+        m_za_ptrace_payload.resize(((m_za_header.vl) * (m_za_header.vl)) +
+                                   GetZAHeaderSize());
+        std::fill(m_za_ptrace_payload.begin(), m_za_ptrace_payload.end(), 0);
+      } else {
+        // ZA is active, read the real register.
+        error = ReadZA();
+        if (error.Fail())
+          return error;
+      }
+
+      // ZA is part of the SME set but uses a separate member buffer for
+      // storage. Therefore its effective byte offset is always 0 even if it
+      // isn't 0 within the SME register set.
+      src = (uint8_t *)GetZABuffer() + GetZAHeaderSize();
+    } else {
+      error = ReadSMESVG();
+      if (error.Fail())
+        return error;
+
+      offset = reg_info->byte_offset - GetRegisterInfo().GetSMEOffset();
+      assert(offset < GetSMEPseudoBufferSize());
+      src = (uint8_t *)GetSMEPseudoBuffer() + offset;
+    }
   } else
     return Status("failed - register wasn't recognized to be a GPR or an FPR, "
                   "write strategy unknown");
@@ -420,8 +470,12 @@ Status NativeRegisterContextLinux_arm64::WriteRegister(
       SetSVERegVG(vg_value);
 
       error = WriteSVEHeader();
-      if (error.Success())
+      if (error.Success()) {
+        // Changing VG during streaming mode also changes the size of ZA.
+        if (m_sve_state == SVEState::Streaming)
+          m_za_header_is_valid = false;
         ConfigureRegisterContext();
+      }
 
       if (m_sve_header_is_valid && vg_value == GetSVERegVG())
         return error;
@@ -494,6 +548,23 @@ Status NativeRegisterContextLinux_arm64::WriteRegister(
     ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size);
 
     return WriteTLS();
+  } else if (IsSME(reg)) {
+    if (!GetRegisterInfo().IsSMERegZA(reg))
+      return Status("Writing to SVG is not supported.");
+
+    error = ReadZA();
+    if (error.Fail())
+      return error;
+
+    // ZA is part of the SME set but not stored with the other SME registers.
+    // So its byte offset is effectively always 0.
+    dst = (uint8_t *)GetZABuffer() + GetZAHeaderSize();
+    ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size);
+
+    // While this is writing a header that contains a vector length, the only
+    // way to change that is via the vg register. So here we assume the length
+    // will always be the current length and no reconfigure is needed.
+    return WriteZA();
   }
 
   return Status("Failed to write register value");
@@ -503,8 +574,10 @@ enum RegisterSetType : uint32_t {
   GPR,
   SVE, // Used for SVE and SSVE.
   FPR, // When there is no SVE, or SVE in FPSIMD mode.
+  // Pointer authentication registers are read-only, so not included here.
   MTE,
   TLS,
+  SME, // ZA only, SVCR and SVG are pseudo registers.
 };
 
 static uint8_t *AddRegisterSetType(uint8_t *dst,
@@ -533,6 +606,24 @@ NativeRegisterContextLinux_arm64::CacheAllRegisters(uint32_t &cached_size) {
   if (error.Fail())
     return error;
 
+  // Here this asks whether the system has ZA at all, not whether it is
+  // currently active.
+  if (GetRegisterInfo().IsZAEnabled()) {
+    error = ReadZAHeader();
+    if (error.Fail())
+      return error;
+    // Use header size here because the buffer may contain fake data when ZA is
+    // disabled. We do not want to write this fake data (all 0s) because this
+    // would tell the kernel that we want ZA to become active, which is the
+    // opposite of what we want in the case where it is currently inactive.
+    cached_size += sizeof(RegisterSetType) + m_za_header.size;
+    // For the same reason, we need to force it to be re-read so that it will
+    // always contain the real header.
+    m_za_buffer_is_valid = false;
+    error = ReadZA();
+    if (error.Fail())
+      return error;
+  }
+
   // If SVE is enabled we need not copy FPR separately.
   if (GetRegisterInfo().IsSVEEnabled() || GetRegisterInfo().IsSSVEEnabled()) {
     // Store mode and register data.
@@ -583,6 +674,45 @@ Status NativeRegisterContextLinux_arm64::ReadAllRegisterValues(
   dst = AddSavedRegisters(dst, RegisterSetType::GPR, GetGPRBuffer(),
                           GetGPRBufferSize());
 
+  // Streaming SVE and the ZA register both use the streaming vector length.
+  // When you change this, the kernel will invalidate parts of the process
+  // state. Therefore we need a specific order of restoration for each mode, if
+  // we also have ZA to restore.
+  //
+  // Streaming mode enabled, ZA enabled:
+  // * Write streaming registers. This sets SVCR.SM and clears SVCR.ZA.
+  // * Write ZA; this sets SVCR.ZA. The register data we provide is written to
+  //   ZA.
+  // * Result is SVCR.SM and SVCR.ZA set, with the expected data in both
+  //   register sets.
+  //
+  // Streaming mode disabled, ZA enabled:
+  // * Write ZA. This sets SVCR.ZA, and the ZA content. In the majority of cases
+  //   the streaming vector length is changing, so the thread is converted into
+  //   an FPSIMD thread if it is not already one. This also clears SVCR.SM.
+  // * Write SVE registers, which also clears SVCR.SM but most importantly, puts
+  //   us into full SVE mode instead of FPSIMD mode (where the registers are
+  //   actually the 128 bit Neon registers).
+  // * Result is we have SVCR.SM = 0, SVCR.ZA = 1 and the expected register
+  //   state.
+  //
+  // Restoring in different orders leads to things like the SVE registers being
+  // truncated due to the FPSIMD mode and ZA being disabled or filled with 0s
+  // (disabled and 0s look the same from inside lldb since we fake the value
+  // when it's disabled).
+  //
+  // For more information on this, look up the uses of the relevant NT_ARM_
+  // constants and the functions vec_set_vector_length, sve_set_common and
+  // za_set in the Linux kernel.
+
+  if ((m_sve_state != SVEState::Streaming) && GetRegisterInfo().IsZAEnabled()) {
+    // Use the header size not the buffer size, as we may be using the buffer
+    // for fake data, which we do not want to write out.
+    assert(m_za_header.size <= GetZABufferSize());
+    dst = AddSavedRegisters(dst, RegisterSetType::SME, GetZABuffer(),
+                            m_za_header.size);
+  }
+
   if (GetRegisterInfo().IsSVEEnabled() || GetRegisterInfo().IsSSVEEnabled()) {
     dst = AddRegisterSetType(dst, RegisterSetType::SVE);
     *(reinterpret_cast(dst)) = m_sve_state;
@@ -593,6 +723,12 @@ Status NativeRegisterContextLinux_arm64::ReadAllRegisterValues(
                             GetFPRSize());
   }
 
+  if ((m_sve_state == SVEState::Streaming) && GetRegisterInfo().IsZAEnabled()) {
+    assert(m_za_header.size <= GetZABufferSize());
+    dst = AddSavedRegisters(dst, RegisterSetType::SME, GetZABuffer(),
+                            m_za_header.size);
+  }
+
   if (GetRegisterInfo().IsMTEEnabled()) {
     dst = AddSavedRegisters(dst, RegisterSetType::MTE, GetMTEControl(),
                             GetMTEControlSize());
@@ -685,6 +821,8 @@ Status NativeRegisterContextLinux_arm64::WriteAllRegisterValues(
       return error;
 
     // SVE header has been written configure SVE vector length if needed.
+    // This could change ZA data too, but that will be restored again later
+    // anyway.
     ConfigureRegisterContext();
 
     // Write header and register data, incrementing src this time.
@@ -707,6 +845,33 @@ Status NativeRegisterContextLinux_arm64::WriteAllRegisterValues(
           GetTLSBuffer(), &src, GetTLSBufferSize(), m_tls_is_valid,
           std::bind(&NativeRegisterContextLinux_arm64::WriteTLS, this));
       break;
+    case RegisterSetType::SME:
+      // To enable or disable ZA you write the regset with or without register
+      // data. The kernel detects this by looking at the ioVec's length, not the
+      // ZA header size you pass in. Therefore we must write header and register
+      // data (if present) in one go every time. Read just the header first to
+      // get its size.
+      ::memcpy(GetZAHeader(), src, GetZAHeaderSize());
+      // Read the header and register data. Can't use the buffer size here, it
+      // may be incorrect due to being filled with dummy data previously. Resize
+      // this so WriteZA uses the correct size.
+      m_za_ptrace_payload.resize(m_za_header.size);
+      ::memcpy(GetZABuffer(), src, GetZABufferSize());
+      m_za_buffer_is_valid = true;
+
+      error = WriteZA();
+      if (error.Fail())
+        return error;
+
+      // Update the size of ZA, which resizes the ptrace payload, potentially
+      // trashing our copy of the data we just wrote.
+      ConfigureRegisterContext();
+
+      // The ZA buffer now has the proper size; read back the data we wrote
+      // above, from ptrace.
+      error = ReadZA();
+      src += GetZABufferSize();
+      break;
     }
 
     if (error.Fail())
@@ -734,6 +899,10 @@ bool NativeRegisterContextLinux_arm64::IsSVE(unsigned reg) const {
   return GetRegisterInfo().IsSVEReg(reg);
 }
 
+bool NativeRegisterContextLinux_arm64::IsSME(unsigned reg) const {
+  return GetRegisterInfo().IsSMEReg(reg);
+}
+
 bool NativeRegisterContextLinux_arm64::IsPAuth(unsigned reg) const {
   return GetRegisterInfo().IsPAuthReg(reg);
 }
@@ -887,11 +1056,13 @@ void NativeRegisterContextLinux_arm64::InvalidateAllRegisters() {
   m_fpu_is_valid = false;
   m_sve_buffer_is_valid = false;
   m_sve_header_is_valid = false;
+  m_za_buffer_is_valid = false;
+  m_za_header_is_valid = false;
   m_pac_mask_is_valid = false;
   m_mte_ctrl_is_valid = false;
   m_tls_is_valid = false;
 
-  // Update SVE registers in case there is change in configuration.
+  // Update SVE and ZA registers in case there is a change in configuration.
   ConfigureRegisterContext();
 }
@@ -1057,6 +1228,62 @@ Status NativeRegisterContextLinux_arm64::WriteTLS() {
   return WriteRegisterSet(&ioVec, GetTLSBufferSize(), NT_ARM_TLS);
 }
 
+Status NativeRegisterContextLinux_arm64::ReadZAHeader() {
+  Status error;
+
+  if (m_za_header_is_valid)
+    return error;
+
+  struct iovec ioVec;
+  ioVec.iov_base = GetZAHeader();
+  ioVec.iov_len = GetZAHeaderSize();
+
+  error = ReadRegisterSet(&ioVec, GetZAHeaderSize(), NT_ARM_ZA);
+
+  if (error.Success())
+    m_za_header_is_valid = true;
+
+  return error;
+}
+
+Status NativeRegisterContextLinux_arm64::ReadZA() {
+  Status error;
+
+  if (m_za_buffer_is_valid)
+    return error;
+
+  struct iovec ioVec;
+  ioVec.iov_base = GetZABuffer();
+  ioVec.iov_len = GetZABufferSize();
+
+  error = ReadRegisterSet(&ioVec, GetZABufferSize(), NT_ARM_ZA);
+
+  if (error.Success())
+    m_za_buffer_is_valid = true;
+
+  return error;
+}
+
+Status NativeRegisterContextLinux_arm64::WriteZA() {
+  // Note that because the ZA ptrace payload contains the header also, this
+  // method will write both. This is done because writing only the header
+  // will disable ZA, even if .size in the header is correct for an enabled ZA.
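+  // (Illustratively: an iovec of just sizeof(user_za_header) bytes would
+  // disable ZA, while one of m_za_header.size bytes writes the data and
+  // keeps ZA enabled.)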
+ Status error; + + error = ReadZA(); + if (error.Fail()) + return error; + + struct iovec ioVec; + ioVec.iov_base = GetZABuffer(); + ioVec.iov_len = GetZABufferSize(); + + m_za_buffer_is_valid = false; + m_za_header_is_valid = false; + + return WriteRegisterSet(&ioVec, GetZABufferSize(), NT_ARM_ZA); +} + void NativeRegisterContextLinux_arm64::ConfigureRegisterContext() { // ConfigureRegisterContext gets called from InvalidateAllRegisters // on every stop and configures SVE vector length and whether we are in @@ -1097,15 +1324,28 @@ void NativeRegisterContextLinux_arm64::ConfigureRegisterContext() { if (m_sve_state == SVEState::Full || m_sve_state == SVEState::FPSIMD || m_sve_state == SVEState::Streaming) { // On every stop we configure SVE vector length by calling - // ConfigureVectorLength regardless of current SVEState of this thread. + // ConfigureVectorLengthSVE regardless of current SVEState of this thread. uint32_t vq = RegisterInfoPOSIX_arm64::eVectorQuadwordAArch64SVE; if (sve::vl_valid(m_sve_header.vl)) vq = sve::vq_from_vl(m_sve_header.vl); - GetRegisterInfo().ConfigureVectorLength(vq); + GetRegisterInfo().ConfigureVectorLengthSVE(vq); m_sve_ptrace_payload.resize(sve::PTraceSize(vq, sve::ptrace_regs_sve)); } } + + if (!m_za_header_is_valid) { + Status error = ReadZAHeader(); + if (error.Success()) { + uint32_t vq = RegisterInfoPOSIX_arm64::eVectorQuadwordAArch64SVE; + if (sve::vl_valid(m_za_header.vl)) + vq = sve::vq_from_vl(m_za_header.vl); + + GetRegisterInfo().ConfigureVectorLengthZA(vq); + m_za_ptrace_payload.resize(m_za_header.size); + m_za_buffer_is_valid = false; + } + } } uint32_t NativeRegisterContextLinux_arm64::CalculateFprOffset( @@ -1131,12 +1371,27 @@ uint32_t NativeRegisterContextLinux_arm64::CalculateSVEOffset( return sve_reg_offset; } +Status NativeRegisterContextLinux_arm64::ReadSMESVG() { + // This register is the streaming vector length, so we will get it from + // NT_ARM_ZA regardless of the current streaming mode. + Status error = ReadZAHeader(); + if (error.Success()) + m_sme_pseudo_regs.svg_reg = m_za_header.vl / 8; + + return error; +} + std::vector NativeRegisterContextLinux_arm64::GetExpeditedRegisters( ExpeditedRegs expType) const { std::vector expedited_reg_nums = NativeRegisterContext::GetExpeditedRegisters(expType); + // SVE, non-streaming vector length. if (m_sve_state == SVEState::FPSIMD || m_sve_state == SVEState::Full) expedited_reg_nums.push_back(GetRegisterInfo().GetRegNumSVEVG()); + // SME, streaming vector length. This is used by the ZA register which is + // present even when streaming mode is not enabled. 
+ if (GetRegisterInfo().IsSSVEEnabled()) + expedited_reg_nums.push_back(GetRegisterInfo().GetRegNumSMESVG()); return expedited_reg_nums; } diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h index ac87397f0782d..20e262fcd0f3d 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h @@ -85,6 +85,8 @@ class NativeRegisterContextLinux_arm64 bool m_mte_ctrl_is_valid; bool m_sve_header_is_valid; + bool m_za_buffer_is_valid; + bool m_za_header_is_valid; bool m_pac_mask_is_valid; bool m_tls_is_valid; size_t m_tls_size; @@ -98,6 +100,9 @@ class NativeRegisterContextLinux_arm64 struct sve::user_sve_header m_sve_header; std::vector m_sve_ptrace_payload; + sve::user_za_header m_za_header; + std::vector m_za_ptrace_payload; + bool m_refresh_hwdebug_info; struct user_pac_mask { @@ -109,6 +114,12 @@ class NativeRegisterContextLinux_arm64 uint64_t m_mte_ctrl_reg; + struct sme_pseudo_regs { + uint64_t svg_reg; + }; + + struct sme_pseudo_regs m_sme_pseudo_regs; + struct tls_regs { uint64_t tpidr_reg; // Only valid when SME is present. @@ -139,7 +150,20 @@ class NativeRegisterContextLinux_arm64 Status WriteTLS(); + Status ReadSMESVG(); + + Status ReadZAHeader(); + + Status ReadZA(); + + Status WriteZA(); + + // No WriteZAHeader because writing only the header will disable ZA. + // Instead use WriteZA and ensure you have the correct ZA buffer size set + // beforehand if you wish to disable it. + bool IsSVE(unsigned reg) const; + bool IsSME(unsigned reg) const; bool IsPAuth(unsigned reg) const; bool IsMTE(unsigned reg) const; bool IsTLS(unsigned reg) const; @@ -150,12 +174,18 @@ class NativeRegisterContextLinux_arm64 void *GetSVEHeader() { return &m_sve_header; } + void *GetZAHeader() { return &m_za_header; } + + size_t GetZAHeaderSize() { return sizeof(m_za_header); } + void *GetPACMask() { return &m_pac_mask; } void *GetMTEControl() { return &m_mte_ctrl_reg; } void *GetTLSBuffer() { return &m_tls_regs; } + void *GetSMEPseudoBuffer() { return &m_sme_pseudo_regs; } + void *GetSVEBuffer() { return m_sve_ptrace_payload.data(); } size_t GetSVEHeaderSize() { return sizeof(m_sve_header); } @@ -166,10 +196,16 @@ class NativeRegisterContextLinux_arm64 unsigned GetSVERegSet(); + void *GetZABuffer() { return m_za_ptrace_payload.data(); }; + + size_t GetZABufferSize() { return m_za_ptrace_payload.size(); } + size_t GetMTEControlSize() { return sizeof(m_mte_ctrl_reg); } size_t GetTLSBufferSize() { return m_tls_size; } + size_t GetSMEPseudoBufferSize() { return sizeof(m_sme_pseudo_regs); } + llvm::Error ReadHardwareDebugInfo() override; llvm::Error WriteHardwareDebugRegs(DREGType hwbType) override; diff --git a/lldb/source/Plugins/Process/Utility/LinuxPTraceDefines_arm64sve.h b/lldb/source/Plugins/Process/Utility/LinuxPTraceDefines_arm64sve.h index 817dca336de7a..8b5393ca18881 100644 --- a/lldb/source/Plugins/Process/Utility/LinuxPTraceDefines_arm64sve.h +++ b/lldb/source/Plugins/Process/Utility/LinuxPTraceDefines_arm64sve.h @@ -152,6 +152,8 @@ struct user_sve_header { uint16_t reserved; }; +using user_za_header = user_sve_header; + /* Definitions for user_sve_header.flags: */ const uint16_t ptrace_regs_mask = 1 << 0; const uint16_t ptrace_regs_fpsimd = 0; diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp index 
d306c818e89f4..b57538e185f71 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp @@ -43,6 +43,10 @@ bool RegisterContextPOSIX_arm64::IsSVE(unsigned reg) const { return m_register_info_up->IsSVEReg(reg); } +bool RegisterContextPOSIX_arm64::IsSME(unsigned reg) const { + return m_register_info_up->IsSMEReg(reg); +} + bool RegisterContextPOSIX_arm64::IsPAuth(unsigned reg) const { return m_register_info_up->IsPAuthReg(reg); } diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h index 6a935274fc40d..f425d9343d463 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h @@ -54,6 +54,7 @@ class RegisterContextPOSIX_arm64 : public lldb_private::RegisterContext { size_t GetFPUSize() { return sizeof(RegisterInfoPOSIX_arm64::FPU); } bool IsSVE(unsigned reg) const; + bool IsSME(unsigned reg) const; bool IsPAuth(unsigned reg) const; bool IsTLS(unsigned reg) const; diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp index 001d7406ff434..8b23f17ca573f 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp @@ -83,6 +83,12 @@ static lldb_private::RegisterInfo g_register_infos_tls[] = { // Only present when SME is present DEFINE_EXTENSION_REG(tpidr2)}; +static lldb_private::RegisterInfo g_register_infos_sme[] = { + DEFINE_EXTENSION_REG(svg), + // 16 is a default size we will change later. + {"za", nullptr, 16, 0, lldb::eEncodingVector, lldb::eFormatVectorOfUInt8, + KIND_ALL_INVALID, nullptr, nullptr, nullptr}}; + // Number of register sets provided by this context. enum { k_num_gpr_registers = gpr_w28 - gpr_x0 + 1, @@ -91,6 +97,7 @@ enum { k_num_mte_register = 1, // Number of TLS registers is dynamic so it is not listed here. k_num_pauth_register = 2, + k_num_sme_register = 2, k_num_register_sets_default = 2, k_num_register_sets = 3 }; @@ -197,6 +204,9 @@ static const lldb_private::RegisterSet g_reg_set_mte_arm64 = { // The size of the TLS set is dynamic, so not listed here. +static const lldb_private::RegisterSet g_reg_set_sme_arm64 = { + "Scalable Matrix Extension Registers", "sme", k_num_sme_register, nullptr}; + RegisterInfoPOSIX_arm64::RegisterInfoPOSIX_arm64( const lldb_private::ArchSpec &target_arch, lldb_private::Flags opt_regsets) : lldb_private::RegisterInfoAndSetInterface(target_arch), @@ -241,6 +251,9 @@ RegisterInfoPOSIX_arm64::RegisterInfoPOSIX_arm64( // present. 
AddRegSetTLS(m_opt_regsets.AllSet(eRegsetMaskSSVE)); + if (m_opt_regsets.AnySet(eRegsetMaskSSVE)) + AddRegSetSME(); + m_register_info_count = m_dynamic_reg_infos.size(); m_register_info_p = m_dynamic_reg_infos.data(); m_register_set_p = m_dynamic_reg_sets.data(); @@ -344,7 +357,25 @@ void RegisterInfoPOSIX_arm64::AddRegSetTLS(bool has_tpidr2) { m_dynamic_reg_sets.back().registers = m_tls_regnum_collection.data(); } -uint32_t RegisterInfoPOSIX_arm64::ConfigureVectorLength(uint32_t sve_vq) { +void RegisterInfoPOSIX_arm64::AddRegSetSME() { + uint32_t sme_regnum = m_dynamic_reg_infos.size(); + for (uint32_t i = 0; i < k_num_sme_register; i++) { + m_sme_regnum_collection.push_back(sme_regnum + i); + m_dynamic_reg_infos.push_back(g_register_infos_sme[i]); + m_dynamic_reg_infos[sme_regnum + i].byte_offset = + m_dynamic_reg_infos[sme_regnum + i - 1].byte_offset + + m_dynamic_reg_infos[sme_regnum + i - 1].byte_size; + m_dynamic_reg_infos[sme_regnum + i].kinds[lldb::eRegisterKindLLDB] = + sme_regnum + i; + } + + m_per_regset_regnum_range[m_register_set_count] = + std::make_pair(sme_regnum, m_dynamic_reg_infos.size()); + m_dynamic_reg_sets.push_back(g_reg_set_sme_arm64); + m_dynamic_reg_sets.back().registers = m_sme_regnum_collection.data(); +} + +uint32_t RegisterInfoPOSIX_arm64::ConfigureVectorLengthSVE(uint32_t sve_vq) { // sve_vq contains SVE Quad vector length in context of AArch64 SVE. // SVE register infos if enabled cannot be disabled by selecting sve_vq = 0. // Also if an invalid or previously set vector length is passed to this @@ -408,6 +439,20 @@ uint32_t RegisterInfoPOSIX_arm64::ConfigureVectorLength(uint32_t sve_vq) { return m_vector_reg_vq; } +void RegisterInfoPOSIX_arm64::ConfigureVectorLengthZA(uint32_t za_vq) { + if (!VectorSizeIsValid(za_vq) || m_za_reg_vq == za_vq) + return; + + m_za_reg_vq = za_vq; + + // For SVE changes, we replace m_register_info_p completely. ZA is in a + // dynamic set and is just 1 register so we make an exception to const here. 
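+  // (For example, za_vq = 4 means a 64 byte streaming vector length, giving
+  // a ZA of 64 * 64 = 4096 bytes. These numbers are illustrative only.)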
+ lldb_private::RegisterInfo *non_const_reginfo = + const_cast(m_register_info_p); + non_const_reginfo[m_sme_regnum_collection[1]].byte_size = + (za_vq * 16) * (za_vq * 16); +} + bool RegisterInfoPOSIX_arm64::IsSVEReg(unsigned reg) const { if (m_vector_reg_vq > eVectorQuadwordAArch64) return (sve_vg <= reg && reg <= sve_ffr); @@ -427,6 +472,10 @@ bool RegisterInfoPOSIX_arm64::IsSVERegVG(unsigned reg) const { return sve_vg == reg; } +bool RegisterInfoPOSIX_arm64::IsSMERegZA(unsigned reg) const { + return reg == m_sme_regnum_collection[1]; +} + bool RegisterInfoPOSIX_arm64::IsPAuthReg(unsigned reg) const { return llvm::is_contained(pauth_regnum_collection, reg); } @@ -439,6 +488,10 @@ bool RegisterInfoPOSIX_arm64::IsTLSReg(unsigned reg) const { return llvm::is_contained(m_tls_regnum_collection, reg); } +bool RegisterInfoPOSIX_arm64::IsSMEReg(unsigned reg) const { + return llvm::is_contained(m_sme_regnum_collection, reg); +} + uint32_t RegisterInfoPOSIX_arm64::GetRegNumSVEZ0() const { return sve_z0; } uint32_t RegisterInfoPOSIX_arm64::GetRegNumSVEFFR() const { return sve_ffr; } @@ -449,6 +502,10 @@ uint32_t RegisterInfoPOSIX_arm64::GetRegNumFPSR() const { return fpu_fpsr; } uint32_t RegisterInfoPOSIX_arm64::GetRegNumSVEVG() const { return sve_vg; } +uint32_t RegisterInfoPOSIX_arm64::GetRegNumSMESVG() const { + return m_sme_regnum_collection[0]; +} + uint32_t RegisterInfoPOSIX_arm64::GetPAuthOffset() const { return m_register_info_p[pauth_regnum_collection[0]].byte_offset; } @@ -460,3 +517,7 @@ uint32_t RegisterInfoPOSIX_arm64::GetMTEOffset() const { uint32_t RegisterInfoPOSIX_arm64::GetTLSOffset() const { return m_register_info_p[m_tls_regnum_collection[0]].byte_offset; } + +uint32_t RegisterInfoPOSIX_arm64::GetSMEOffset() const { + return m_register_info_p[m_sme_regnum_collection[0]].byte_offset; +} diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h index 5de7cfea14c1b..debdf4c76abc2 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h @@ -30,6 +30,7 @@ class RegisterInfoPOSIX_arm64 eRegsetMaskPAuth = 4, eRegsetMaskMTE = 8, eRegsetMaskTLS = 16, + eRegsetMaskZA = 32, eRegsetMaskDynamic = ~1, }; @@ -106,7 +107,11 @@ class RegisterInfoPOSIX_arm64 void AddRegSetTLS(bool has_tpidr2); - uint32_t ConfigureVectorLength(uint32_t sve_vq); + void AddRegSetSME(); + + uint32_t ConfigureVectorLengthSVE(uint32_t sve_vq); + + void ConfigureVectorLengthZA(uint32_t za_vq); bool VectorSizeIsValid(uint32_t vq) { // coverity[unsigned_compare] @@ -117,6 +122,7 @@ class RegisterInfoPOSIX_arm64 bool IsSVEEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskSVE); } bool IsSSVEEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskSSVE); } + bool IsZAEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskZA); } bool IsPAuthEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskPAuth); } bool IsMTEEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskMTE); } bool IsTLSEnabled() const { return m_opt_regsets.AnySet(eRegsetMaskTLS); } @@ -128,15 +134,19 @@ class RegisterInfoPOSIX_arm64 bool IsPAuthReg(unsigned reg) const; bool IsMTEReg(unsigned reg) const; bool IsTLSReg(unsigned reg) const; + bool IsSMEReg(unsigned reg) const; + bool IsSMERegZA(unsigned reg) const; uint32_t GetRegNumSVEZ0() const; uint32_t GetRegNumSVEFFR() const; uint32_t GetRegNumFPCR() const; uint32_t GetRegNumFPSR() const; uint32_t GetRegNumSVEVG() const; + 
uint32_t GetRegNumSMESVG() const; uint32_t GetPAuthOffset() const; uint32_t GetMTEOffset() const; uint32_t GetTLSOffset() const; + uint32_t GetSMEOffset() const; private: typedef std::map> @@ -145,7 +155,10 @@ class RegisterInfoPOSIX_arm64 per_vq_register_infos m_per_vq_reg_infos; uint32_t m_vector_reg_vq = eVectorQuadwordAArch64; + uint32_t m_za_reg_vq = eVectorQuadwordAArch64; + // In normal operation this is const. Only when SVE or SME registers change + // size is it either replaced or the content modified. const lldb_private::RegisterInfo *m_register_info_p; uint32_t m_register_info_count; @@ -164,6 +177,7 @@ class RegisterInfoPOSIX_arm64 std::vector pauth_regnum_collection; std::vector m_mte_regnum_collection; std::vector m_tls_regnum_collection; + std::vector m_sme_regnum_collection; }; #endif diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp index 38abd8f8f2b11..e93fbfe4b6af8 100644 --- a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp +++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp @@ -113,7 +113,7 @@ void RegisterContextCorePOSIX_arm64::ConfigureRegisterContext() { m_sve_state = SVEState::Disabled; if (m_sve_state != SVEState::Disabled) - m_register_info_up->ConfigureVectorLength( + m_register_info_up->ConfigureVectorLengthSVE( sve::vq_from_vl(m_sve_vector_length)); } diff --git a/lldb/source/Plugins/Process/elf-core/RegisterUtilities.h b/lldb/source/Plugins/Process/elf-core/RegisterUtilities.h index 3d53a5795ef3e..7691d3b92ce01 100644 --- a/lldb/source/Plugins/Process/elf-core/RegisterUtilities.h +++ b/lldb/source/Plugins/Process/elf-core/RegisterUtilities.h @@ -119,6 +119,10 @@ constexpr RegsetDesc AARCH64_SVE_Desc[] = { {llvm::Triple::Linux, llvm::Triple::aarch64, llvm::ELF::NT_ARM_SVE}, }; +constexpr RegsetDesc AARCH64_ZA_Desc[] = { + {llvm::Triple::Linux, llvm::Triple::aarch64, llvm::ELF::NT_ARM_ZA}, +}; + constexpr RegsetDesc AARCH64_PAC_Desc[] = { {llvm::Triple::Linux, llvm::Triple::aarch64, llvm::ELF::NT_ARM_PAC_MASK}, }; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterContext.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterContext.cpp index bd9211bbaad52..16dde28f6ee53 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterContext.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterContext.cpp @@ -373,14 +373,14 @@ bool GDBRemoteRegisterContext::WriteRegisterBytes(const RegisterInfo *reg_info, if (dst == nullptr) return false; - // Code below is specific to AArch64 target in SVE state + // Code below is specific to AArch64 target in SVE or SME state // If vector granule (vg) register is being written then thread's // register context reconfiguration is triggered on success. - bool do_reconfigure_arm64_sve = false; + // We do not allow writes to SVG so it is not mentioned here. 
const ArchSpec &arch = process->GetTarget().GetArchitecture(); - if (arch.IsValid() && arch.GetTriple().isAArch64()) - if (strcmp(reg_info->name, "vg") == 0) - do_reconfigure_arm64_sve = true; + bool do_reconfigure_arm64_sve = arch.IsValid() && + arch.GetTriple().isAArch64() && + (strcmp(reg_info->name, "vg") == 0); if (data.CopyByteOrderedData(data_offset, // src offset reg_info->byte_size, // src length @@ -400,10 +400,10 @@ bool GDBRemoteRegisterContext::WriteRegisterBytes(const RegisterInfo *reg_info, {m_reg_data.GetDataStart(), size_t(m_reg_data.GetByteSize())})) { - SetAllRegisterValid(false); - if (do_reconfigure_arm64_sve) - AArch64SVEReconfigure(); + AArch64Reconfigure(); + + InvalidateAllRegisters(); return true; } @@ -435,8 +435,10 @@ bool GDBRemoteRegisterContext::WriteRegisterBytes(const RegisterInfo *reg_info, // This is an actual register, write it success = SetPrimordialRegister(reg_info, gdb_comm); - if (success && do_reconfigure_arm64_sve) - AArch64SVEReconfigure(); + if (success && do_reconfigure_arm64_sve) { + AArch64Reconfigure(); + InvalidateAllRegisters(); + } } // Check if writing this register will invalidate any other register @@ -760,37 +762,47 @@ uint32_t GDBRemoteRegisterContext::ConvertRegisterKindToRegisterNumber( return m_reg_info_sp->ConvertRegisterKindToRegisterNumber(kind, num); } -bool GDBRemoteRegisterContext::AArch64SVEReconfigure() { - if (!m_reg_info_sp) - return false; - - const RegisterInfo *reg_info = m_reg_info_sp->GetRegisterInfo("vg"); - if (!reg_info) - return false; - - uint64_t fail_value = LLDB_INVALID_ADDRESS; - uint32_t vg_reg_num = reg_info->kinds[eRegisterKindLLDB]; - uint64_t vg_reg_value = ReadRegisterAsUnsigned(vg_reg_num, fail_value); +void GDBRemoteRegisterContext::AArch64Reconfigure() { + assert(m_reg_info_sp); - if (vg_reg_value == fail_value || vg_reg_value > 32) - return false; - - reg_info = m_reg_info_sp->GetRegisterInfo("p0"); - // Predicate registers have 1 bit per byte in the vector so their size is - // VL / 8. VG is in units of 8 bytes already, so if the size of p0 == VG - // already, we do not have to reconfigure. - if (!reg_info || vg_reg_value == reg_info->byte_size) - return false; + // Once we start to reconfigure registers, we cannot read any of them. + // So we must read VG and SVG up front. 
-  m_reg_info_sp->UpdateARM64SVERegistersInfos(vg_reg_value);
-  // Make a heap based buffer that is big enough to store all registers
-  m_reg_data.SetData(std::make_shared(
-      m_reg_info_sp->GetRegisterDataByteSize(), 0));
-  m_reg_data.SetByteOrder(GetByteOrder());
+  const uint64_t fail_value = LLDB_INVALID_ADDRESS;
+  std::optional vg_reg_value;
+  const RegisterInfo *vg_reg_info = m_reg_info_sp->GetRegisterInfo("vg");
+  if (vg_reg_info) {
+    uint32_t vg_reg_num = vg_reg_info->kinds[eRegisterKindLLDB];
+    uint64_t reg_value = ReadRegisterAsUnsigned(vg_reg_num, fail_value);
+    if (reg_value != fail_value && reg_value <= 32)
+      vg_reg_value = reg_value;
+  }
 
-  InvalidateAllRegisters();
+  std::optional svg_reg_value;
+  const RegisterInfo *svg_reg_info = m_reg_info_sp->GetRegisterInfo("svg");
+  if (svg_reg_info) {
+    uint32_t svg_reg_num = svg_reg_info->kinds[eRegisterKindLLDB];
+    uint64_t reg_value = ReadRegisterAsUnsigned(svg_reg_num, fail_value);
+    if (reg_value != fail_value && reg_value <= 32)
+      svg_reg_value = reg_value;
+  }
 
-  return true;
+  if (vg_reg_value)
+    m_reg_info_sp->UpdateARM64SVERegistersInfos(*vg_reg_value);
+  if (svg_reg_value)
+    m_reg_info_sp->UpdateARM64SMERegistersInfos(*svg_reg_value);
+
+  // At this point if we have updated any registers, their offsets will all be
+  // invalid. If we did, we need to update them all.
+  if (vg_reg_value || svg_reg_value) {
+    m_reg_info_sp->ConfigureOffsets();
+    // From here we are able to read registers again.
+
+    // Make a heap based buffer that is big enough to store all registers.
+    m_reg_data.SetData(std::make_shared(
+        m_reg_info_sp->GetRegisterDataByteSize(), 0));
+    m_reg_data.SetByteOrder(GetByteOrder());
+  }
 }
 
 void GDBRemoteDynamicRegisterInfo::UpdateARM64SVERegistersInfos(uint64_t vg) {
@@ -811,7 +823,14 @@ void GDBRemoteDynamicRegisterInfo::UpdateARM64SVERegistersInfos(uint64_t vg) {
     }
     reg.byte_offset = LLDB_INVALID_INDEX32;
   }
+}
 
-  // Re-calculate register offsets
-  ConfigureOffsets();
+void GDBRemoteDynamicRegisterInfo::UpdateARM64SMERegistersInfos(uint64_t svg) {
+  for (auto &reg : m_regs) {
+    if (strcmp(reg.name, "za") == 0) {
+      // ZA is a register of size (svg * 8) * (svg * 8) - essentially a square.
+      reg.byte_size = (svg * 8) * (svg * 8);
+    }
+    reg.byte_offset = LLDB_INVALID_INDEX32;
+  }
 }
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterContext.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterContext.h
index a4fa18327047c..1b6b7e3ce1cb2 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterContext.h
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteRegisterContext.h
@@ -39,6 +39,7 @@ class GDBRemoteDynamicRegisterInfo final : public DynamicRegisterInfo {
   ~GDBRemoteDynamicRegisterInfo() override = default;
 
   void UpdateARM64SVERegistersInfos(uint64_t vg);
+  void UpdateARM64SMERegistersInfos(uint64_t svg);
 };
 
 class GDBRemoteRegisterContext : public RegisterContext {
@@ -77,7 +78,8 @@ class GDBRemoteRegisterContext : public RegisterContext {
   uint32_t ConvertRegisterKindToRegisterNumber(lldb::RegisterKind kind,
                                                uint32_t num) override;
 
-  bool AArch64SVEReconfigure();
+  // Reconfigure variable sized registers for AArch64 SVE and SME.
+ void AArch64Reconfigure(); protected: friend class ThreadGDBRemote; diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 2fc446b1dd7ab..4d489f1ac947d 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -1660,17 +1660,18 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( gdb_thread->PrivateSetRegisterValue(lldb_regnum, buffer_sp->GetData()); } - // AArch64 SVE specific code below calls AArch64SVEReconfigure to update - // SVE register sizes and offsets if value of VG register has changed - // since last stop. + // AArch64 SVE/SME specific code below updates SVE and ZA register sizes and + // offsets if value of VG or SVG registers has changed since last stop. const ArchSpec &arch = GetTarget().GetArchitecture(); if (arch.IsValid() && arch.GetTriple().isAArch64()) { GDBRemoteRegisterContext *reg_ctx_sp = static_cast( gdb_thread->GetRegisterContext().get()); - if (reg_ctx_sp) - reg_ctx_sp->AArch64SVEReconfigure(); + if (reg_ctx_sp) { + reg_ctx_sp->AArch64Reconfigure(); + reg_ctx_sp->InvalidateAllRegisters(); + } } thread_sp->SetName(thread_name.empty() ? nullptr : thread_name.c_str()); diff --git a/lldb/source/Target/DynamicRegisterInfo.cpp b/lldb/source/Target/DynamicRegisterInfo.cpp index f1f45e52918d3..0372fd48feae6 100644 --- a/lldb/source/Target/DynamicRegisterInfo.cpp +++ b/lldb/source/Target/DynamicRegisterInfo.cpp @@ -614,10 +614,11 @@ void DynamicRegisterInfo::Finalize(const ArchSpec &arch) { ConfigureOffsets(); // Check if register info is reconfigurable - // AArch64 SVE register set has configurable register sizes + // AArch64 SVE register set has configurable register sizes, as does the ZA + // register that SME added (the streaming state of SME reuses the SVE state). 
 if (arch.GetTriple().isAArch64()) {
     for (const auto &reg : m_regs) {
-      if (strcmp(reg.name, "vg") == 0) {
+      if ((strcmp(reg.name, "vg") == 0) || (strcmp(reg.name, "svg") == 0)) {
         m_is_reconfigurable = true;
         break;
       }
diff --git a/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py b/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py
index d3f53f0e95dfc..4f4da2b5223fb 100644
--- a/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py
+++ b/lldb/test/API/commands/register/register/aarch64_dynamic_regset/TestArm64DynamicRegsets.py
@@ -70,15 +70,13 @@ def sve_regs_read_dynamic(self, sve_registers):
         self.runCmd("register write ffr " + "'" + p_regs_value + "'")
         self.expect("register read ffr", substrs=[p_regs_value])
 
-    @no_debug_info_test
-    @skipIf(archs=no_match(["aarch64"]))
-    @skipIf(oslist=no_match(["linux"]))
-    def test_aarch64_dynamic_regset_config(self):
-        """Test AArch64 Dynamic Register sets configuration."""
+    def setup_register_config_test(self, run_args=None):
         self.build()
         self.line = line_number("main.c", "// Set a break point here.")
 
         exe = self.getBuildArtifact("a.out")
+        if run_args is not None:
+            self.runCmd("settings set target.run-args " + run_args)
         self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET)
 
         lldbutil.run_break_set_by_file_and_line(
@@ -92,12 +90,16 @@ def test_aarch64_dynamic_regset_config(self):
             substrs=["stop reason = breakpoint 1."],
         )
 
-        target = self.dbg.GetSelectedTarget()
-        process = target.GetProcess()
-        thread = process.GetThreadAtIndex(0)
-        currentFrame = thread.GetFrameAtIndex(0)
+        return self.thread().GetSelectedFrame().GetRegisters()
+
+    @no_debug_info_test
+    @skipIf(archs=no_match(["aarch64"]))
+    @skipIf(oslist=no_match(["linux"]))
+    def test_aarch64_dynamic_regset_config(self):
+        """Test AArch64 Dynamic Register sets configuration."""
+        register_sets = self.setup_register_config_test()
 
-        for registerSet in currentFrame.GetRegisters():
+        for registerSet in register_sets:
             if "Scalable Vector Extension Registers" in registerSet.GetName():
                 self.assertTrue(
                     self.isAArch64SVE(),
@@ -120,6 +122,19 @@ def test_aarch64_dynamic_regset_config(self):
                 )
                 self.expect("register read data_mask", substrs=["data_mask = 0x"])
                 self.expect("register read code_mask", substrs=["code_mask = 0x"])
+            if "Scalable Matrix Extension Registers" in registerSet.GetName():
+                self.assertTrue(
+                    self.isAArch64SME(),
+                    "LLDB enabled the SME register set when it was disabled by the target",
+                )
+
+    def make_za_value(self, vl, generator):
+        # Generate a vector value string "{0x00 0x01....}".
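+        # For example (illustrative), make_za_value(2, lambda r: r) returns
+        # "{0x00 0x00 0x01 0x01}": vl rows, each repeating the generated
+        # byte vl times.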
+ rows = [] + for row in range(vl): + byte = "0x{:02x}".format(generator(row)) + rows.append(" ".join([byte] * vl)) + return "{" + " ".join(rows) + "}" @no_debug_info_test @skipIf(archs=no_match(["aarch64"])) @@ -130,28 +145,58 @@ def test_aarch64_dynamic_regset_config_sme(self): if not self.isAArch64SME(): self.skipTest("SME must be present.") - self.build() - self.line = line_number("main.c", "// Set a break point here.") + register_sets = self.setup_register_config_test("sme") - exe = self.getBuildArtifact("a.out") - self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) - - lldbutil.run_break_set_by_file_and_line( - self, "main.c", self.line, num_expected_locations=1 + ssve_registers = register_sets.GetFirstValueByName( + "Scalable Vector Extension Registers" ) - self.runCmd("settings set target.run-args sme") - self.runCmd("run", RUN_SUCCEEDED) + self.assertTrue(ssve_registers.IsValid()) + self.sve_regs_read_dynamic(ssve_registers) - self.expect( - "thread backtrace", - STOPPED_DUE_TO_BREAKPOINT, - substrs=["stop reason = breakpoint 1."], + sme_registers = register_sets.GetFirstValueByName( + "Scalable Matrix Extension Registers" ) + self.assertTrue(sme_registers.IsValid()) - register_sets = self.thread().GetSelectedFrame().GetRegisters() + vg = ssve_registers.GetChildMemberWithName("vg").GetValueAsUnsigned() + vl = vg * 8 + # When first enabled it is all 0s. + self.expect("register read za", substrs=[self.make_za_value(vl, lambda r: 0)]) + za_value = self.make_za_value(vl, lambda r: r + 1) + self.runCmd("register write za '{}'".format(za_value)) + self.expect("register read za", substrs=[za_value]) - ssve_registers = register_sets.GetFirstValueByName( - "Scalable Vector Extension Registers" + # SVG should match VG because we're in streaming mode. + + self.assertTrue(sme_registers.IsValid()) + svg = sme_registers.GetChildMemberWithName("svg").GetValueAsUnsigned() + self.assertEqual(vg, svg) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_aarch64_dynamic_regset_config_sme_za_disabled(self): + """Test that ZA shows as 0s when disabled and can be enabled by writing + to it.""" + if not self.isAArch64SME(): + self.skipTest("SME must be present.") + + # No argument, so ZA will be disabled when we break. + register_sets = self.setup_register_config_test() + + # vg is the non-streaming vg as we are in non-streaming mode, so we need + # to use svg. + sme_registers = register_sets.GetFirstValueByName( + "Scalable Matrix Extension Registers" ) - self.assertTrue(ssve_registers.IsValid()) - self.sve_regs_read_dynamic(ssve_registers) + self.assertTrue(sme_registers.IsValid()) + svg = sme_registers.GetChildMemberWithName("svg").GetValueAsUnsigned() + + svl = svg * 8 + # A disabled ZA is shown as all 0s. + self.expect("register read za", substrs=[self.make_za_value(svl, lambda r: 0)]) + za_value = self.make_za_value(svl, lambda r: r + 1) + # Writing to it enables ZA, so the value should be there when we read + # it back. 
+ self.runCmd("register write za '{}'".format(za_value)) + self.expect("register read za", substrs=[za_value]) diff --git a/lldb/test/API/commands/register/register/aarch64_dynamic_regset/main.c b/lldb/test/API/commands/register/register/aarch64_dynamic_regset/main.c index b8db8852b8ed5..93100f01502f8 100644 --- a/lldb/test/API/commands/register/register/aarch64_dynamic_regset/main.c +++ b/lldb/test/API/commands/register/register/aarch64_dynamic_regset/main.c @@ -1,5 +1,9 @@ #include +// If this program receives 0 arguments, it will use non-streaming SVE +// registers. If the number of arguments is >= 1, it will use streaming SVE +// registers. + #ifndef HWCAP2_SME #define HWCAP2_SME (1 << 23) #endif diff --git a/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/TestSVEThreadedDynamic.py b/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/TestSVEThreadedDynamic.py index ecac371267497..8bcb76776459d 100644 --- a/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/TestSVEThreadedDynamic.py +++ b/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/TestSVEThreadedDynamic.py @@ -98,6 +98,12 @@ def check_sve_registers(self, vg_test_value): self.expect("register read ffr", substrs=[p_regs_value]) + def build_for_mode(self, mode): + cflags = "-march=armv8-a+sve -lpthread" + if mode == Mode.SSVE: + cflags += " -DUSE_SSVE" + self.build(dictionary={"CFLAGS_EXTRAS": cflags}) + def run_sve_test(self, mode): if (mode == Mode.SVE) and not self.isAArch64SVE(): self.skipTest("SVE registers must be supported.") @@ -105,12 +111,8 @@ def run_sve_test(self, mode): if (mode == Mode.SSVE) and not self.isAArch64SME(): self.skipTest("Streaming SVE registers must be supported.") - cflags = "-march=armv8-a+sve -lpthread" - if mode == Mode.SSVE: - cflags += " -DUSE_SSVE" - self.build(dictionary={"CFLAGS_EXTRAS": cflags}) + self.build_for_mode(mode) - self.build() supported_vg = self.get_supported_vg() if not (2 in supported_vg and 4 in supported_vg): @@ -196,3 +198,94 @@ def test_sve_registers_dynamic_config(self): def test_ssve_registers_dynamic_config(self): """Test AArch64 SSVE registers multi-threaded dynamic resize.""" self.run_sve_test(Mode.SSVE) + + def setup_svg_test(self, mode): + # Even when running in SVE mode, we need access to SVG for these tests. 
+ if not self.isAArch64SME(): + self.skipTest("Streaming SVE registers must be present.") + + self.build_for_mode(mode) + + supported_vg = self.get_supported_vg() + + main_thread_stop_line = line_number("main.c", "// Break in main thread") + lldbutil.run_break_set_by_file_and_line(self, "main.c", main_thread_stop_line) + + self.runCmd("run", RUN_SUCCEEDED) + + self.expect( + "thread info 1", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stop reason = breakpoint"], + ) + + target = self.dbg.GetSelectedTarget() + process = target.GetProcess() + + return process, supported_vg + + def read_reg(self, process, regset, reg): + registerSets = process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetRegisters() + sve_registers = registerSets.GetFirstValueByName(regset) + return sve_registers.GetChildMemberWithName(reg).GetValueAsUnsigned() + + def read_vg(self, process): + return self.read_reg(process, "Scalable Vector Extension Registers", "vg") + + def read_svg(self, process): + return self.read_reg(process, "Scalable Matrix Extension Registers", "svg") + + def do_svg_test(self, process, vgs, expected_svgs): + for vg, svg in zip(vgs, expected_svgs): + self.runCmd("register write vg {}".format(vg)) + self.assertEqual(svg, self.read_svg(process)) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_svg_sve_mode(self): + """When in SVE mode, svg should remain constant as we change vg.""" + process, supported_vg = self.setup_svg_test(Mode.SVE) + svg = self.read_svg(process) + self.do_svg_test(process, supported_vg, [svg] * len(supported_vg)) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_svg_ssve_mode(self): + """When in SSVE mode, changing vg should change svg to the same value.""" + process, supported_vg = self.setup_svg_test(Mode.SSVE) + self.do_svg_test(process, supported_vg, supported_vg) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_sme_not_present(self): + """When there is no SME, we should not show the SME register sets.""" + if self.isAArch64SME(): + self.skipTest("Streaming SVE registers must not be present.") + + self.build_for_mode(Mode.SVE) + + exe = self.getBuildArtifact("a.out") + self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) + + # This test may run on a non-sve system, but we'll stop before any + # SVE instruction would be run. + self.runCmd("b main") + self.runCmd("run", RUN_SUCCEEDED) + + self.expect( + "thread info 1", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stop reason = breakpoint"], + ) + + target = self.dbg.GetSelectedTarget() + process = target.GetProcess() + + registerSets = process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetRegisters() + sme_registers = registerSets.GetFirstValueByName( + "Scalable Matrix Extension Registers" + ) + self.assertFalse(sme_registers.IsValid()) diff --git a/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/main.c b/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/main.c index 0bb6b3b57046f..d126f8d75fb0c 100644 --- a/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/main.c +++ b/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_dynamic_resize/main.c @@ -2,6 +2,9 @@ #include #include +// If USE_SSVE is defined, this program will use streaming mode SVE registers +// instead of non-streaming mode SVE registers. 
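+//
+// As a rough illustration (not necessarily the exact sequence used in this
+// file), streaming mode is entered with something like:
+//   prctl(PR_SME_SET_VL, vl);              // set the streaming vector length
+//   asm volatile("msr s0_3_c4_c3_3, xzr"); // SMSTART SM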
+
 #ifndef PR_SME_SET_VL
 #define PR_SME_SET_VL 63
 #endif
diff --git a/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_static_config/main.c b/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_static_config/main.c
index b4623fe12725a..b07fb62ed85b7 100644
--- a/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_static_config/main.c
+++ b/lldb/test/API/commands/register/register/aarch64_sve_registers/rw_access_static_config/main.c
@@ -1,6 +1,10 @@
 #include
 #include
 
+// If START_SSVE is defined, this program will start in streaming SVE mode
+// (it will later enter and exit streaming mode a few times). Otherwise, it
+// will start in non-streaming SVE mode.
+
 #ifndef PR_SME_SET_VL
 #define PR_SME_SET_VL 63
 #endif
diff --git a/lldb/test/API/commands/register/register/aarch64_sve_simd_registers/main.c b/lldb/test/API/commands/register/register/aarch64_sve_simd_registers/main.c
index 2156f094cab5c..0a05a9a5a59de 100644
--- a/lldb/test/API/commands/register/register/aarch64_sve_simd_registers/main.c
+++ b/lldb/test/API/commands/register/register/aarch64_sve_simd_registers/main.c
@@ -1,6 +1,16 @@
 #include
 #include
 
+// If SSVE is defined, this program will start in streaming SVE mode.
+// Otherwise, if SVE is defined, it will start in non-streaming mode and
+// activate the SVE registers by writing to one of them. If neither SSVE nor
+// SVE is defined, the program will start in non-streaming mode, with the SVE
+// registers inactive.
+//
+// For most programs, the difference between inactive and active
+// non-streaming SVE is transparent. For lldb, there are some differences in
+// how we use ptrace in either scenario.
+
 // base is added to each value. If base = 2, then v0 = 2, v1 = 3, etc.
 void write_simd_regs(unsigned base) {
 #define WRITE_SIMD(NUM)                                                        \
diff --git a/lldb/test/API/commands/register/register/aarch64_za_register/za_dynamic_resize/Makefile b/lldb/test/API/commands/register/register/aarch64_za_register/za_dynamic_resize/Makefile
new file mode 100644
index 0000000000000..57d926b37d45c
--- /dev/null
+++ b/lldb/test/API/commands/register/register/aarch64_za_register/za_dynamic_resize/Makefile
@@ -0,0 +1,5 @@
+C_SOURCES := main.c
+
+CFLAGS_EXTRAS := -march=armv8-a+sve+sme -lpthread
+
+include Makefile.rules
diff --git a/lldb/test/API/commands/register/register/aarch64_za_register/za_dynamic_resize/TestZAThreadedDynamic.py b/lldb/test/API/commands/register/register/aarch64_za_register/za_dynamic_resize/TestZAThreadedDynamic.py
new file mode 100644
index 0000000000000..65d1071c26b2a
--- /dev/null
+++ b/lldb/test/API/commands/register/register/aarch64_za_register/za_dynamic_resize/TestZAThreadedDynamic.py
@@ -0,0 +1,165 @@
+"""
+Test the AArch64 SME Array Storage (ZA) register dynamic resize with
+multiple threads.
+""" + +from enum import Enum +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class AArch64ZAThreadedTestCase(TestBase): + def get_supported_vg(self): + exe = self.getBuildArtifact("a.out") + self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) + + main_thread_stop_line = line_number("main.c", "// Break in main thread") + lldbutil.run_break_set_by_file_and_line(self, "main.c", main_thread_stop_line) + + self.runCmd("settings set target.run-args 0") + self.runCmd("run", RUN_SUCCEEDED) + + self.expect( + "thread info 1", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stop reason = breakpoint"], + ) + + current_vg = self.match("register read vg", ["(0x[0-9]+)"]) + self.assertTrue(current_vg is not None) + self.expect("register write vg {}".format(current_vg.group())) + + # Aka 128, 256 and 512 bit. + supported_vg = [] + for vg in [2, 4, 8]: + # This could mask other errors but writing vg is tested elsewhere + # so we assume the hardware rejected the value. + self.runCmd("register write vg {}".format(vg), check=False) + if not self.res.GetError(): + supported_vg.append(vg) + + self.runCmd("breakpoint delete 1") + self.runCmd("continue") + + return supported_vg + + def gen_za_value(self, svg, value_generator): + svl = svg * 8 + + rows = [] + for row in range(svl): + byte = "0x{:02x}".format(value_generator(row)) + rows.append(" ".join([byte] * svl)) + + return "{" + " ".join(rows) + "}" + + def check_za_register(self, svg, value_offset): + self.expect( + "register read za", + substrs=[self.gen_za_value(svg, lambda r: r + value_offset)], + ) + + def check_disabled_za_register(self, svg): + self.expect("register read za", substrs=[self.gen_za_value(svg, lambda r: 0)]) + + def za_test_impl(self, enable_za): + if not self.isAArch64SME(): + self.skipTest("SME must be present.") + + self.build() + supported_vg = self.get_supported_vg() + + self.runCmd("settings set target.run-args {}".format("1" if enable_za else "0")) + + if not (2 in supported_vg and 4 in supported_vg): + self.skipTest("Not all required streaming vector lengths are supported.") + + main_thread_stop_line = line_number("main.c", "// Break in main thread") + lldbutil.run_break_set_by_file_and_line(self, "main.c", main_thread_stop_line) + + thX_break_line1 = line_number("main.c", "// Thread X breakpoint 1") + lldbutil.run_break_set_by_file_and_line(self, "main.c", thX_break_line1) + + thX_break_line2 = line_number("main.c", "// Thread X breakpoint 2") + lldbutil.run_break_set_by_file_and_line(self, "main.c", thX_break_line2) + + thY_break_line1 = line_number("main.c", "// Thread Y breakpoint 1") + lldbutil.run_break_set_by_file_and_line(self, "main.c", thY_break_line1) + + thY_break_line2 = line_number("main.c", "// Thread Y breakpoint 2") + lldbutil.run_break_set_by_file_and_line(self, "main.c", thY_break_line2) + + self.runCmd("run", RUN_SUCCEEDED) + + self.expect( + "thread info 1", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stop reason = breakpoint"], + ) + + if 8 in supported_vg: + if enable_za: + self.check_za_register(8, 1) + else: + self.check_disabled_za_register(8) + else: + if enable_za: + self.check_za_register(4, 1) + else: + self.check_disabled_za_register(4) + + self.runCmd("process continue", RUN_SUCCEEDED) + + process = self.dbg.GetSelectedTarget().GetProcess() + for idx in range(1, process.GetNumThreads()): + thread = process.GetThreadAtIndex(idx) + if thread.GetStopReason() != lldb.eStopReasonBreakpoint: + self.runCmd("thread continue %d" % 
(idx + 1)) + self.assertEqual(thread.GetStopReason(), lldb.eStopReasonBreakpoint) + + stopped_at_line_number = thread.GetFrameAtIndex(0).GetLineEntry().GetLine() + + if stopped_at_line_number == thX_break_line1: + self.runCmd("thread select %d" % (idx + 1)) + self.check_za_register(4, 2) + self.runCmd("register write vg 2") + + elif stopped_at_line_number == thY_break_line1: + self.runCmd("thread select %d" % (idx + 1)) + self.check_za_register(2, 3) + self.runCmd("register write vg 4") + + self.runCmd("thread continue 2") + self.runCmd("thread continue 3") + + for idx in range(1, process.GetNumThreads()): + thread = process.GetThreadAtIndex(idx) + self.assertEqual(thread.GetStopReason(), lldb.eStopReasonBreakpoint) + + stopped_at_line_number = thread.GetFrameAtIndex(0).GetLineEntry().GetLine() + + if stopped_at_line_number == thX_break_line2: + self.runCmd("thread select %d" % (idx + 1)) + self.check_za_register(2, 2) + + elif stopped_at_line_number == thY_break_line2: + self.runCmd("thread select %d" % (idx + 1)) + self.check_za_register(4, 3) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_register_dynamic_config_main_enabled(self): + """Test multiple threads resizing ZA, with the main thread's ZA + enabled.""" + self.za_test_impl(True) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_register_dynamic_config_main_disabled(self): + """Test multiple threads resizing ZA, with the main thread's ZA + disabled.""" + self.za_test_impl(False) diff --git a/lldb/test/API/commands/register/register/aarch64_za_register/za_dynamic_resize/main.c b/lldb/test/API/commands/register/register/aarch64_za_register/za_dynamic_resize/main.c new file mode 100644 index 0000000000000..fd2590dbe411f --- /dev/null +++ b/lldb/test/API/commands/register/register/aarch64_za_register/za_dynamic_resize/main.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include + +// Important notes for this test: +// * Making a syscall will disable streaming mode. +// * LLDB writing to vg while in streaming mode will disable ZA +// (this is just how ptrace works). +// * Using an instruction to write to an inactive ZA produces a SIGILL +// (doing the same thing via ptrace does not, as the kernel activates ZA for +// us in that case). + +#ifndef PR_SME_SET_VL +#define PR_SME_SET_VL 63 +#endif + +#define SM_INST(c) asm volatile("msr s0_3_c4_c" #c "_3, xzr") +#define SMSTART_SM SM_INST(3) +#define SMSTART_ZA SM_INST(5) + +void set_za_register(int svl, int value_offset) { +#define MAX_VL_BYTES 256 + uint8_t data[MAX_VL_BYTES]; + + // ldr za will actually wrap the selected vector row, by the number of rows + // you have. So setting one that didn't exist would actually set one that did. + // That's why we need the streaming vector length here. + for (int i = 0; i < svl; ++i) { + memset(data, i + value_offset, MAX_VL_BYTES); + // Each one of these loads a VL sized row of ZA. + asm volatile("mov w12, %w0\n\t" + "ldr za[w12, 0], [%1]\n\t" ::"r"(i), + "r"(&data) + : "w12"); + } +} + +// These are used to make sure we only break in each thread once both of the +// threads have been started. Otherwise when the test does "process continue" +// it could stop in one thread and wait forever for the other one to start. 
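+// Each thread sets its own flag first and then spins on the other thread's +// flag, so neither thread can reach its breakpoints until both are running.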
+atomic_bool threadX_ready = false; +atomic_bool threadY_ready = false; + +void *threadX_func(void *x_arg) { + threadX_ready = true; + while (!threadY_ready) { + } + + prctl(PR_SME_SET_VL, 8 * 4); + SMSTART_SM; + SMSTART_ZA; + set_za_register(8 * 4, 2); + SMSTART_ZA; // Thread X breakpoint 1 + set_za_register(8 * 2, 2); + return NULL; // Thread X breakpoint 2 +} + +void *threadY_func(void *y_arg) { + threadY_ready = true; + while (!threadX_ready) { + } + + prctl(PR_SME_SET_VL, 8 * 2); + SMSTART_SM; + SMSTART_ZA; + set_za_register(8 * 2, 3); + SMSTART_ZA; // Thread Y breakpoint 1 + set_za_register(8 * 4, 3); + return NULL; // Thread Y breakpoint 2 +} + +int main(int argc, char *argv[]) { + // Expecting argument to tell us whether to enable ZA on the main thread. + if (argc != 2) + return 1; + + prctl(PR_SME_SET_VL, 8 * 8); + SMSTART_SM; + + if (argv[1][0] == '1') { + SMSTART_ZA; + set_za_register(8 * 8, 1); + } + // else we do not enable ZA and lldb will show 0s for it. + + pthread_t x_thread; + if (pthread_create(&x_thread, NULL, threadX_func, 0)) // Break in main thread + return 1; + + pthread_t y_thread; + if (pthread_create(&y_thread, NULL, threadY_func, 0)) + return 1; + + if (pthread_join(x_thread, NULL)) + return 2; + + if (pthread_join(y_thread, NULL)) + return 2; + + return 0; +} diff --git a/lldb/test/API/commands/register/register/aarch64_za_register/za_save_restore/Makefile b/lldb/test/API/commands/register/register/aarch64_za_register/za_save_restore/Makefile new file mode 100644 index 0000000000000..f2ca08f3531aa --- /dev/null +++ b/lldb/test/API/commands/register/register/aarch64_za_register/za_save_restore/Makefile @@ -0,0 +1,5 @@ +C_SOURCES := main.c + +CFLAGS_EXTRAS := -march=armv8-a+sve+sme + +include Makefile.rules diff --git a/lldb/test/API/commands/register/register/aarch64_za_register/za_save_restore/TestZARegisterSaveRestore.py b/lldb/test/API/commands/register/register/aarch64_za_register/za_save_restore/TestZARegisterSaveRestore.py new file mode 100644 index 0000000000000..1d4bbd6207a51 --- /dev/null +++ b/lldb/test/API/commands/register/register/aarch64_za_register/za_save_restore/TestZARegisterSaveRestore.py @@ -0,0 +1,252 @@ +""" +Test that the AArch64 SME ZA register is saved and restored around expressions. + +This attempts to cover expressions that change the following: +* ZA enabled or not. +* Streaming mode or not. +* Streaming vector length (increasing and decreasing). +* Some combinations of the above. +""" + +from enum import IntEnum +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +# These enum values match the flag values used in the test program. +class Mode(IntEnum): + SVE = 0 + SSVE = 1 + + +class ZA(IntEnum): + Disabled = 0 + Enabled = 1 + + +class AArch64ZATestCase(TestBase): + def get_supported_svg(self): + # Always build this probe program to start as streaming SVE. + # We will read/write "vg" here but since we are in streaming mode "svg" + # is really what we are writing ("svg" is a read only pseudo). + self.build() + + exe = self.getBuildArtifact("a.out") + self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) + # Enter streaming mode, don't enable ZA, start_vl and other_vl don't + # matter here.
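+ # The four run-args values are, in order: streaming mode flag, ZA flag, + # start vector length and other vector length (the order main.c reads them).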
+ self.runCmd("settings set target.run-args 1 0 0 0") + + stop_line = line_number("main.c", "// Set a break point here.") + lldbutil.run_break_set_by_file_and_line( + self, "main.c", stop_line, num_expected_locations=1 + ) + + self.runCmd("run", RUN_SUCCEEDED) + + self.expect( + "thread info 1", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stop reason = breakpoint"], + ) + + # Write back the current vg to confirm read/write works at all. + current_svg = self.match("register read vg", ["(0x[0-9]+)"]) + self.assertTrue(current_svg is not None) + self.expect("register write vg {}".format(current_svg.group())) + + # Aka 128, 256 and 512 bit. + supported_svg = [] + for svg in [2, 4, 8]: + # This could mask other errors but writing vg is tested elsewhere + # so we assume the hardware rejected the value. + self.runCmd("register write vg {}".format(svg), check=False) + if not self.res.GetError(): + supported_svg.append(svg) + + self.runCmd("breakpoint delete 1") + self.runCmd("continue") + + return supported_svg + + def read_vg(self): + process = self.dbg.GetSelectedTarget().GetProcess() + registerSets = process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetRegisters() + sve_registers = registerSets.GetFirstValueByName( + "Scalable Vector Extension Registers" + ) + return sve_registers.GetChildMemberWithName("vg").GetValueAsUnsigned() + + def read_svg(self): + process = self.dbg.GetSelectedTarget().GetProcess() + registerSets = process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetRegisters() + sve_registers = registerSets.GetFirstValueByName( + "Scalable Matrix Extension Registers" + ) + return sve_registers.GetChildMemberWithName("svg").GetValueAsUnsigned() + + def make_za_value(self, vl, generator): + # Generate a vector value string "{0x00 0x01....}". + rows = [] + for row in range(vl): + byte = "0x{:02x}".format(generator(row)) + rows.append(" ".join([byte] * vl)) + return "{" + " ".join(rows) + "}" + + def check_za(self, vl): + # We expect an increasing value starting at 1. Row 0=1, row 1 = 2, etc. + self.expect( + "register read za", substrs=[self.make_za_value(vl, lambda row: row + 1)] + ) + + def check_za_disabled(self, vl): + # When ZA is disabled, lldb will show ZA as all 0s. + self.expect("register read za", substrs=[self.make_za_value(vl, lambda row: 0)]) + + def za_expr_test_impl(self, sve_mode, za_state, swap_start_vl): + if not self.isAArch64SME(): + self.skipTest("SME must be present.") + + supported_svg = self.get_supported_svg() + # supported_svg[1] and supported_svg[2] are used below, so three + # supported lengths are required, not two. + if len(supported_svg) < 3: + self.skipTest("Target must support at least 3 streaming vector lengths.") + + # vg is in units of 8 bytes. + start_vl = supported_svg[1] * 8 + other_vl = supported_svg[2] * 8 + + if swap_start_vl: + start_vl, other_vl = other_vl, start_vl + + self.line = line_number("main.c", "// Set a break point here.") + + exe = self.getBuildArtifact("a.out") + self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) + self.runCmd( + "settings set target.run-args {} {} {} {}".format( + sve_mode, za_state, start_vl, other_vl + ) + ) + + lldbutil.run_break_set_by_file_and_line( + self, "main.c", self.line, num_expected_locations=1 + ) + self.runCmd("run", RUN_SUCCEEDED) + + self.expect( + "thread backtrace", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stop reason = breakpoint 1."], + ) + + exprs = [ + "expr_disable_za", + "expr_enable_za", + "expr_start_vl", + "expr_other_vl", + "expr_enable_sm", + "expr_disable_sm", + ] + + # This may be the streaming or non-streaming vg. All that matters is + # that it is saved and restored, remaining constant throughout.
+ start_vg = self.read_vg() + + # Check SVE registers to make sure that combination of scaling SVE + # and scaling ZA works properly. This is a brittle check, but failures + # are likely to be catastrophic when they do happen anyway. + sve_reg_names = "ffr {} {}".format( + " ".join(["z{}".format(n) for n in range(32)]), + " ".join(["p{}".format(n) for n in range(16)]), + ) + self.runCmd("register read " + sve_reg_names) + sve_values = self.res.GetOutput() + + def check_regs(): + if za_state == ZA.Enabled: + self.check_za(start_vl) + else: + self.check_za_disabled(start_vl) + + # svg and vg are in units of 8 bytes. + self.assertEqual(start_vl, self.read_svg() * 8) + self.assertEqual(start_vg, self.read_vg()) + + self.expect("register read " + sve_reg_names, substrs=[sve_values]) + + for expr in exprs: + expr_cmd = "expression {}()".format(expr) + + # We do this twice because there were issues in development where + # using data stored by a previous WriteAllRegisterValues would crash + # the second time around. + self.runCmd(expr_cmd) + check_regs() + self.runCmd(expr_cmd) + check_regs() + + # Run them in sequence to make sure there is no state lingering between + # them after a restore. + for expr in exprs: + self.runCmd("expression {}()".format(expr)) + check_regs() + + for expr in reversed(exprs): + self.runCmd("expression {}()".format(expr)) + check_regs() + + # These tests start with the 1st supported SVL and change to the 2nd + # supported SVL as needed. + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_expr_ssve_za_enabled(self): + self.za_expr_test_impl(Mode.SSVE, ZA.Enabled, False) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_expr_ssve_za_disabled(self): + self.za_expr_test_impl(Mode.SSVE, ZA.Disabled, False) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_expr_sve_za_enabled(self): + self.za_expr_test_impl(Mode.SVE, ZA.Enabled, False) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_expr_sve_za_disabled(self): + self.za_expr_test_impl(Mode.SVE, ZA.Disabled, False) + + # These tests start in the 2nd supported SVL and change to the 1st supported + # SVL as needed. 
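+ # (swap_start_vl=True makes za_expr_test_impl swap start_vl and other_vl, + # which is how the order is reversed for the tests below.)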
+ + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_expr_ssve_za_enabled_different_vl(self): + self.za_expr_test_impl(Mode.SSVE, ZA.Enabled, True) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_expr_ssve_za_disabled_different_vl(self): + self.za_expr_test_impl(Mode.SSVE, ZA.Disabled, True) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_expr_sve_za_enabled_different_vl(self): + self.za_expr_test_impl(Mode.SVE, ZA.Enabled, True) + + @no_debug_info_test + @skipIf(archs=no_match(["aarch64"])) + @skipIf(oslist=no_match(["linux"])) + def test_za_expr_sve_za_disabled_different_vl(self): + self.za_expr_test_impl(Mode.SVE, ZA.Disabled, True) diff --git a/lldb/test/API/commands/register/register/aarch64_za_register/za_save_restore/main.c b/lldb/test/API/commands/register/register/aarch64_za_register/za_save_restore/main.c new file mode 100644 index 0000000000000..a8434787a5a12 --- /dev/null +++ b/lldb/test/API/commands/register/register/aarch64_za_register/za_save_restore/main.c @@ -0,0 +1,225 @@ +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/prctl.h> + +// Important details for this program: +// * Making a syscall will disable streaming mode if it is active. +// * Changing the vector length will make streaming mode and ZA inactive. +// * ZA can be active independent of streaming mode. +// * ZA's size is the streaming vector length squared. + +#ifndef PR_SME_SET_VL +#define PR_SME_SET_VL 63 +#endif + +#ifndef PR_SME_GET_VL +#define PR_SME_GET_VL 64 +#endif + +#ifndef PR_SME_VL_LEN_MASK +#define PR_SME_VL_LEN_MASK 0xffff +#endif + +#define SM_INST(c) asm volatile("msr s0_3_c4_c" #c "_3, xzr") +#define SMSTART SM_INST(7) +#define SMSTART_SM SM_INST(3) +#define SMSTART_ZA SM_INST(5) +#define SMSTOP SM_INST(6) +#define SMSTOP_SM SM_INST(2) +#define SMSTOP_ZA SM_INST(4) + +int start_vl = 0; +int other_vl = 0; + +void write_sve_regs() { + // We assume the smefa64 feature is present, which allows ffr access + // in streaming mode.
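+ // setffr below sets every element of ffr to one, a known value that the + // expression version of this function deliberately clobbers via wrffr.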
+ asm volatile("setffr\n\t"); + asm volatile("ptrue p0.b\n\t"); + asm volatile("ptrue p1.h\n\t"); + asm volatile("ptrue p2.s\n\t"); + asm volatile("ptrue p3.d\n\t"); + asm volatile("pfalse p4.b\n\t"); + asm volatile("ptrue p5.b\n\t"); + asm volatile("ptrue p6.h\n\t"); + asm volatile("ptrue p7.s\n\t"); + asm volatile("ptrue p8.d\n\t"); + asm volatile("pfalse p9.b\n\t"); + asm volatile("ptrue p10.b\n\t"); + asm volatile("ptrue p11.h\n\t"); + asm volatile("ptrue p12.s\n\t"); + asm volatile("ptrue p13.d\n\t"); + asm volatile("pfalse p14.b\n\t"); + asm volatile("ptrue p15.b\n\t"); + + asm volatile("cpy z0.b, p0/z, #1\n\t"); + asm volatile("cpy z1.b, p5/z, #2\n\t"); + asm volatile("cpy z2.b, p10/z, #3\n\t"); + asm volatile("cpy z3.b, p15/z, #4\n\t"); + asm volatile("cpy z4.b, p0/z, #5\n\t"); + asm volatile("cpy z5.b, p5/z, #6\n\t"); + asm volatile("cpy z6.b, p10/z, #7\n\t"); + asm volatile("cpy z7.b, p15/z, #8\n\t"); + asm volatile("cpy z8.b, p0/z, #9\n\t"); + asm volatile("cpy z9.b, p5/z, #10\n\t"); + asm volatile("cpy z10.b, p10/z, #11\n\t"); + asm volatile("cpy z11.b, p15/z, #12\n\t"); + asm volatile("cpy z12.b, p0/z, #13\n\t"); + asm volatile("cpy z13.b, p5/z, #14\n\t"); + asm volatile("cpy z14.b, p10/z, #15\n\t"); + asm volatile("cpy z15.b, p15/z, #16\n\t"); + asm volatile("cpy z16.b, p0/z, #17\n\t"); + asm volatile("cpy z17.b, p5/z, #18\n\t"); + asm volatile("cpy z18.b, p10/z, #19\n\t"); + asm volatile("cpy z19.b, p15/z, #20\n\t"); + asm volatile("cpy z20.b, p0/z, #21\n\t"); + asm volatile("cpy z21.b, p5/z, #22\n\t"); + asm volatile("cpy z22.b, p10/z, #23\n\t"); + asm volatile("cpy z23.b, p15/z, #24\n\t"); + asm volatile("cpy z24.b, p0/z, #25\n\t"); + asm volatile("cpy z25.b, p5/z, #26\n\t"); + asm volatile("cpy z26.b, p10/z, #27\n\t"); + asm volatile("cpy z27.b, p15/z, #28\n\t"); + asm volatile("cpy z28.b, p0/z, #29\n\t"); + asm volatile("cpy z29.b, p5/z, #30\n\t"); + asm volatile("cpy z30.b, p10/z, #31\n\t"); + asm volatile("cpy z31.b, p15/z, #32\n\t"); +} + +// Write something different so we will know if we didn't restore them +// correctly. 
+void write_sve_regs_expr() { + asm volatile("pfalse p0.b\n\t"); + asm volatile("wrffr p0.b\n\t"); + asm volatile("pfalse p1.b\n\t"); + asm volatile("pfalse p2.b\n\t"); + asm volatile("pfalse p3.b\n\t"); + asm volatile("ptrue p4.b\n\t"); + asm volatile("pfalse p5.b\n\t"); + asm volatile("pfalse p6.b\n\t"); + asm volatile("pfalse p7.b\n\t"); + asm volatile("pfalse p8.b\n\t"); + asm volatile("ptrue p9.b\n\t"); + asm volatile("pfalse p10.b\n\t"); + asm volatile("pfalse p11.b\n\t"); + asm volatile("pfalse p12.b\n\t"); + asm volatile("pfalse p13.b\n\t"); + asm volatile("ptrue p14.b\n\t"); + asm volatile("pfalse p15.b\n\t"); + + asm volatile("cpy z0.b, p0/z, #2\n\t"); + asm volatile("cpy z1.b, p5/z, #3\n\t"); + asm volatile("cpy z2.b, p10/z, #4\n\t"); + asm volatile("cpy z3.b, p15/z, #5\n\t"); + asm volatile("cpy z4.b, p0/z, #6\n\t"); + asm volatile("cpy z5.b, p5/z, #7\n\t"); + asm volatile("cpy z6.b, p10/z, #8\n\t"); + asm volatile("cpy z7.b, p15/z, #9\n\t"); + asm volatile("cpy z8.b, p0/z, #10\n\t"); + asm volatile("cpy z9.b, p5/z, #11\n\t"); + asm volatile("cpy z10.b, p10/z, #12\n\t"); + asm volatile("cpy z11.b, p15/z, #13\n\t"); + asm volatile("cpy z12.b, p0/z, #14\n\t"); + asm volatile("cpy z13.b, p5/z, #15\n\t"); + asm volatile("cpy z14.b, p10/z, #16\n\t"); + asm volatile("cpy z15.b, p15/z, #17\n\t"); + asm volatile("cpy z16.b, p0/z, #18\n\t"); + asm volatile("cpy z17.b, p5/z, #19\n\t"); + asm volatile("cpy z18.b, p10/z, #20\n\t"); + asm volatile("cpy z19.b, p15/z, #21\n\t"); + asm volatile("cpy z20.b, p0/z, #22\n\t"); + asm volatile("cpy z21.b, p5/z, #23\n\t"); + asm volatile("cpy z22.b, p10/z, #24\n\t"); + asm volatile("cpy z23.b, p15/z, #25\n\t"); + asm volatile("cpy z24.b, p0/z, #26\n\t"); + asm volatile("cpy z25.b, p5/z, #27\n\t"); + asm volatile("cpy z26.b, p10/z, #28\n\t"); + asm volatile("cpy z27.b, p15/z, #29\n\t"); + asm volatile("cpy z28.b, p0/z, #30\n\t"); + asm volatile("cpy z29.b, p5/z, #31\n\t"); + asm volatile("cpy z30.b, p10/z, #32\n\t"); + asm volatile("cpy z31.b, p15/z, #33\n\t"); +} + +void set_za_register(int svl, int value_offset) { +#define MAX_VL_BYTES 256 + uint8_t data[MAX_VL_BYTES]; + + // ldr za will actually wrap the selected vector row, by the number of rows + // you have. So setting one that didn't exist would actually set one that did. + // That's why we need the streaming vector length here. + for (int i = 0; i < svl; ++i) { + memset(data, i + value_offset, MAX_VL_BYTES); + // Each one of these loads a VL sized row of ZA. 
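+ // (The wrapping means that, for example, with 4 rows, w12 == 5 would + // load row 1.)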
+ asm volatile("mov w12, %w0\n\t" + "ldr za[w12, 0], [%1]\n\t" ::"r"(i), + "r"(&data) + : "w12"); + } +} + +void expr_disable_za() { + SMSTOP_ZA; + write_sve_regs_expr(); +} + +void expr_enable_za() { + SMSTART_ZA; + set_za_register(start_vl, 2); + write_sve_regs_expr(); +} + +void expr_start_vl() { + prctl(PR_SME_SET_VL, start_vl); + SMSTART_ZA; + set_za_register(start_vl, 4); + write_sve_regs_expr(); +} + +void expr_other_vl() { + prctl(PR_SME_SET_VL, other_vl); + SMSTART_ZA; + set_za_register(other_vl, 5); + write_sve_regs_expr(); +} + +void expr_enable_sm() { + SMSTART_SM; + write_sve_regs_expr(); +} + +void expr_disable_sm() { + SMSTOP_SM; + write_sve_regs_expr(); +} + +int main(int argc, char *argv[]) { + // We expect to get: + // * whether to enable streaming mode + // * whether to enable ZA + // * what the starting VL should be + // * what the other VL should be + if (argc != 5) + return 1; + + bool ssve = argv[1][0] == '1'; + bool za = argv[2][0] == '1'; + start_vl = atoi(argv[3]); + other_vl = atoi(argv[4]); + + prctl(PR_SME_SET_VL, start_vl); + + if (ssve) + SMSTART_SM; + + if (za) { + SMSTART_ZA; + set_za_register(start_vl, 1); + } + + write_sve_regs(); + + return 0; // Set a break point here. +} diff --git a/lldb/test/API/python_api/value/TestValueAPI.py b/lldb/test/API/python_api/value/TestValueAPI.py index 3855c67c8d372..75052671d8d26 100644 --- a/lldb/test/API/python_api/value/TestValueAPI.py +++ b/lldb/test/API/python_api/value/TestValueAPI.py @@ -210,7 +210,7 @@ def test(self): # Check that hex value printing works as expected. self.assertEqual( frame0.FindVariable("fixed_int_ptr").GetValue(), - "0x00000000000000aa", + "0x000000aa" if target.addr_size == 4 else "0x00000000000000aa", ) self.runCmd("settings set target.show-hex-variable-values-with-leading-zeroes false") self.assertEqual( diff --git a/lldb/unittests/Core/DumpDataExtractorTest.cpp b/lldb/unittests/Core/DumpDataExtractorTest.cpp index bbe5e9e5ed9e2..8c58fecfee29c 100644 --- a/lldb/unittests/Core/DumpDataExtractorTest.cpp +++ b/lldb/unittests/Core/DumpDataExtractorTest.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "lldb/Core/DumpDataExtractor.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Host/HostInfo.h" #include "lldb/Utility/DataBufferHeap.h" #include "lldb/Utility/DataExtractor.h" #include "lldb/Utility/Endian.h" @@ -18,6 +20,20 @@ using namespace lldb; using namespace lldb_private; +// This is needed for the tests because they rely on the Target global +// properties. 
+class DumpDataExtractorTest : public ::testing::Test { +public: + void SetUp() override { + FileSystem::Initialize(); + HostInfo::Initialize(); + } + void TearDown() override { + HostInfo::Terminate(); + FileSystem::Terminate(); + } +}; + static void TestDumpWithAddress(uint64_t base_addr, size_t item_count, llvm::StringRef expected) { std::vector<uint8_t> data{0x11, 0x22}; @@ -32,7 +48,7 @@ static void TestDumpWithAddress(uint64_t base_addr, size_t item_count, ASSERT_EQ(expected, result.GetString()); } -TEST(DumpDataExtractorTest, BaseAddress) { +TEST_F(DumpDataExtractorTest, BaseAddress) { TestDumpWithAddress(0x12341234, 1, "0x12341234: 0x11"); TestDumpWithAddress(LLDB_INVALID_ADDRESS, 1, "0x11"); TestDumpWithAddress(0x12341234, 2, "0x12341234: 0x11\n0x12341235: 0x22"); @@ -53,7 +69,7 @@ static void TestDumpWithOffset(offset_t start_offset, ASSERT_EQ(expected, result.GetString()); } -TEST(DumpDataExtractorTest, StartOffset) { +TEST_F(DumpDataExtractorTest, StartOffset) { TestDumpWithOffset(0, "0x00000000: 0x11 0x22 0x33"); // The offset applies to the DataExtractor, not the address used when // formatting. @@ -62,7 +78,7 @@ TEST(DumpDataExtractorTest, StartOffset) { TestDumpWithOffset(3, ""); } -TEST(DumpDataExtractorTest, NullStream) { +TEST_F(DumpDataExtractorTest, NullStream) { // We don't do any work if there is no output stream. uint8_t c = 0x11; StreamString result; @@ -112,7 +128,7 @@ static void TestDump(const std::vector<T> data, lldb::Format format, LLDB_INVALID_ADDRESS, format, expected); } -TEST(DumpDataExtractorTest, Formats) { +TEST_F(DumpDataExtractorTest, Formats) { TestDump(1, lldb::eFormatDefault, "0x01"); TestDump(1, lldb::eFormatBoolean, "true"); TestDump(0xAA, lldb::eFormatBinary, "0b10101010"); @@ -258,7 +274,7 @@ TEST(DumpDataExtractorTest, Formats) { TestDump(99, lldb::Format::eFormatVoid, "0x00000063"); } -TEST(DumpDataExtractorTest, FormatCharArray) { +TEST_F(DumpDataExtractorTest, FormatCharArray) { // Unlike the other formats, charArray isn't 1 array of N chars. // It must be passed as N chars of 1 byte each. // (eFormatVectorOfChar does this swap for you) @@ -299,7 +315,7 @@ void TestDumpMultiLine(const T *data, size_t num_items, lldb::Format format, 0x80000000, format, expected); } -TEST(DumpDataExtractorTest, MultiLine) { +TEST_F(DumpDataExtractorTest, MultiLine) { // A vector counts as 1 item regardless of size.
TestDumpMultiLine(std::vector<uint8_t>{0x11}, lldb::Format::eFormatVectorOfUInt8, 1, @@ -354,7 +370,7 @@ void TestDumpWithItemByteSize(size_t item_byte_size, lldb::Format format, expected); } -TEST(DumpDataExtractorTest, ItemByteSizeErrors) { +TEST_F(DumpDataExtractorTest, ItemByteSizeErrors) { TestDumpWithItemByteSize( 16, lldb::Format::eFormatBoolean, "error: unsupported byte size (16) for boolean format"); diff --git a/llvm-libgcc/CMakeLists.txt b/llvm-libgcc/CMakeLists.txt index de42d908c3711..013c9ca2e3307 100644 --- a/llvm-libgcc/CMakeLists.txt +++ b/llvm-libgcc/CMakeLists.txt @@ -1,19 +1,38 @@ -cmake_minimum_required(VERSION 3.20.0) +#=============================================================================== +# Setup Project +#=============================================================================== -if (NOT IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/../libunwind") - message(FATAL_ERROR "llvm-libgcc requires being built in a monorepo layout with libunwind available") -endif() +cmake_minimum_required(VERSION 3.20.0) set(LLVM_COMMON_CMAKE_UTILS "${CMAKE_CURRENT_SOURCE_DIR}/../cmake") -list(APPEND CMAKE_MODULE_PATH +# Check if llvm-libgcc is built as a standalone project +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LLVM_LIBGCC_STANDALONE_BUILD) + project(llvm-libgcc LANGUAGES C CXX ASM) + set(COMPILER_RT_STANDALONE_BUILD ON) + set_property(GLOBAL PROPERTY USE_FOLDERS ON) + set(LLVM_LIBGCC_COMPILER_RT_BINARY_DIR "compiler-rt") + set(LLVM_LIBGCC_LIBUNWIND_BINARY_DIR "libunwind") +else() + set(LLVM_LIBGCC_COMPILER_RT_BINARY_DIR "../compiler-rt") + set(LLVM_LIBGCC_LIBUNWIND_BINARY_DIR "../libunwind") +endif() + +# Add path for custom modules +list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules" + "${CMAKE_CURRENT_SOURCE_DIR}/../runtimes/cmake/Modules" "${LLVM_COMMON_CMAKE_UTILS}" "${LLVM_COMMON_CMAKE_UTILS}/Modules" - ) +) + +set(LLVM_LIBGCC_LIBUNWIND_PATH "${CMAKE_CURRENT_LIST_DIR}/../libunwind" + CACHE PATH "Specify path to libunwind source.") +set(LLVM_LIBGCC_COMPILER_RT_PATH "${CMAKE_CURRENT_LIST_DIR}/../compiler-rt" + CACHE PATH "Specify path to compiler-rt source.") -project(llvm-libgcc LANGUAGES C CXX ASM) +include(GNUInstallDirs) if(NOT LLVM_LIBGCC_EXPLICIT_OPT_IN) message(FATAL_ERROR @@ -25,18 +44,114 @@ if(NOT LLVM_LIBGCC_EXPLICIT_OPT_IN) "to your CMake invocation and try again.") endif() -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib${LLVMLIB_DIR_SUFFIX}") -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib${LLVMLIB_DIR_SUFFIX}") -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") +if(HAVE_COMPILER_RT) + message(FATAL_ERROR + "Attempting to build both compiler-rt and llvm-libgcc will cause irreconcilable " + "target clashes. Please choose one or the other, but not both.") +endif() + +if(HAVE_LIBUNWIND) + message(FATAL_ERROR + "Attempting to build both libunwind and llvm-libgcc will cause irreconcilable " + "target clashes.
Please choose one or the other, but not both.") +endif() + +#=============================================================================== +# Configure System +#=============================================================================== + +if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE) + set(LLVM_LIBGCC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) + set(LLVM_LIBGCC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}/${LLVM_DEFAULT_TARGET_TRIPLE} CACHE PATH + "Path where built llvm-libgcc libraries should be installed.") + if(LIBCXX_LIBDIR_SUBDIR) + string(APPEND LLVM_LIBGCC_LIBRARY_DIR /${LLVM_LIBGCC_LIBDIR_SUBDIR}) + string(APPEND LLVM_LIBGCC_INSTALL_LIBRARY_DIR /${LLVM_LIBGCC_LIBDIR_SUBDIR}) + endif() +else() + if(LLVM_LIBRARY_OUTPUT_INTDIR) + set(LLVM_LIBGCC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) + else() + set(LLVM_LIBGCC_LIBRARY_DIR ${CMAKE_BINARY_DIR}/lib${LLVM_LIBGCC_LIBDIR_SUFFIX}) + endif() + set(LLVM_LIBGCC_INSTALL_LIBRARY_DIR lib${LLVM_LIBGCC_LIBDIR_SUFFIX} CACHE PATH + "Path where built llvm-libgcc libraries should be installed.") +endif() + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBGCC_LIBRARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LLVM_LIBGCC_LIBRARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LLVM_LIBGCC_LIBRARY_DIR}) + +#=============================================================================== +# Build libraries +#=============================================================================== + +set(COMPILER_RT_BUILD_BUILTINS ON) +set(COMPILER_RT_BUILTINS_HIDE_SYMBOLS OFF) +add_subdirectory(${LLVM_LIBGCC_COMPILER_RT_PATH} ${LLVM_LIBGCC_COMPILER_RT_BINARY_DIR}) + +set(LIBUNWIND_ENABLE_STATIC ON) +set(LIBUNWIND_ENABLE_SHARED ON) +set(LIBUNWIND_USE_COMPILER_RT OFF) +set(LIBUNWIND_HAS_GCC_LIB OFF) +set(LIBUNWIND_HAS_GCC_S_LIB OFF) +add_subdirectory(${LLVM_LIBGCC_LIBUNWIND_PATH} ${LLVM_LIBGCC_LIBUNWIND_BINARY_DIR}) + +add_custom_target(gcc_s.ver + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/gcc_s.ver.in + COMMAND ${CMAKE_C_COMPILER} -E + -xc ${CMAKE_CURRENT_SOURCE_DIR}/gcc_s.ver.in + -o ${CMAKE_CURRENT_BINARY_DIR}/gcc_s.ver +) + +add_dependencies(unwind_shared gcc_s.ver) + +construct_compiler_rt_default_triple() + +target_link_options(unwind_shared PUBLIC + -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/gcc_s.ver +) + +target_link_libraries(unwind_shared PUBLIC + $ + m +) + +#=============================================================================== +# Install Symlinks +#=============================================================================== + +get_compiler_rt_install_dir(${COMPILER_RT_DEFAULT_TARGET_ARCH} install_dir_builtins) +string(REGEX REPLACE "^lib/" "" install_dir_builtins "${install_dir_builtins}") +string(FIND "${install_dir_builtins}" "clang" install_path_contains_triple) +if(install_path_contains_triple EQUAL -1) + set(builtins_suffix "-${COMPILER_RT_DEFAULT_TARGET_ARCH}") +else() + string(PREPEND install_dir_builtins "../") +endif() +set(LLVM_LIBGCC_COMPILER_RT ${install_dir_builtins}/libclang_rt.builtins${builtins_suffix}.a) + +add_custom_target(llvm-libgcc ALL + DEPENDS unwind_shared unwind_static clang_rt.builtins-${COMPILER_RT_DEFAULT_TARGET_ARCH} + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LLVM_LIBGCC_COMPILER_RT} libgcc.a + COMMAND ${CMAKE_COMMAND} -E create_symlink libunwind.a libgcc_eh.a + COMMAND ${CMAKE_COMMAND} -E create_symlink libunwind.so libgcc_s.so.1.0 + COMMAND ${CMAKE_COMMAND} -E create_symlink libgcc_s.so.1.0 libgcc_s.so.1 + COMMAND ${CMAKE_COMMAND} -E create_symlink libgcc_s.so.1 
libgcc_s.so +) -set(COMPILER_RT_BUILD_BUILTINS On) -set(COMPILER_RT_BUILTINS_HIDE_SYMBOLS Off) -add_subdirectory("../compiler-rt" "compiler-rt") +install(TARGETS unwind_shared unwind_static + LIBRARY DESTINATION ${LLVM_LIBGCC_INSTALL_LIBRARY_DIR} COMPONENT llvm-libgcc + ARCHIVE DESTINATION ${LLVM_LIBGCC_INSTALL_LIBRARY_DIR} COMPONENT llvm-libgcc + RUNTIME DESTINATION ${LLVM_LIBGCC_INSTALL_RUNTIME_DIR} COMPONENT llvm-libgcc) -set(LIBUNWIND_ENABLE_STATIC On) -set(LIBUNWIND_ENABLE_SHARED Off) -set(LIBUNWIND_HAS_COMMENT_LIB_PRAGMA Off) -set(LIBUNWIND_USE_COMPILER_RT On) -add_subdirectory("../libunwind" "libunwind") +install(TARGETS clang_rt.builtins-${COMPILER_RT_DEFAULT_TARGET_ARCH} + LIBRARY DESTINATION ${LLVM_LIBGCC_INSTALL_LIBRARY_DIR}/${install_dir_builtins} COMPONENT llvm-libgcc + ARCHIVE DESTINATION ${LLVM_LIBGCC_INSTALL_LIBRARY_DIR}/${install_dir_builtins} COMPONENT llvm-libgcc + RUNTIME DESTINATION ${LLVM_LIBGCC_INSTALL_RUNTIME_DIR}/${install_dir_builtins} COMPONENT llvm-libgcc) -add_subdirectory(lib) +foreach(VAR libgcc.a libgcc_eh.a libgcc_s.so.1.0 libgcc_s.so.1 libgcc_s.so) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${VAR} + DESTINATION ${LLVM_LIBGCC_INSTALL_LIBRARY_DIR} + COMPONENT llvm-libgcc) +endforeach() diff --git a/llvm-libgcc/lib/gcc_s.ver b/llvm-libgcc/gcc_s.ver.in similarity index 100% rename from llvm-libgcc/lib/gcc_s.ver rename to llvm-libgcc/gcc_s.ver.in diff --git a/llvm-libgcc/lib/CMakeLists.txt b/llvm-libgcc/lib/CMakeLists.txt deleted file mode 100644 index d895a21554b03..0000000000000 --- a/llvm-libgcc/lib/CMakeLists.txt +++ /dev/null @@ -1,86 +0,0 @@ -include(CheckLibraryExists) -include(GNUInstallDirs) -include(ExtendPath) - -string(REPLACE "-Wl,-z,defs" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}") - -add_custom_target(gcc_s_ver ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/gcc_s.ver") -set(LLVM_LIBGCC_GCC_S_VER "${CMAKE_CURRENT_BINARY_DIR}/gcc_s.ver") - -add_custom_target(gcc_s.ver ALL - DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/gcc_s.ver" - COMMAND - "${CMAKE_C_COMPILER}" - "-E" - "-xc" "${CMAKE_CURRENT_SOURCE_DIR}/gcc_s.ver" - "-o" "${CMAKE_CURRENT_BINARY_DIR}/gcc_s.ver" -) -set_target_properties(gcc_s.ver PROPERTIES - OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/gcc_s.ver") - -add_library(libgcc_s SHARED blank.c) -add_dependencies(libgcc_s gcc_s_ver) -set_target_properties(libgcc_s - PROPERTIES - LINKER_LANGUAGE C - OUTPUT_NAME "unwind" - VERSION "1.0" - SOVERSION "1" - POSITION_INDEPENDENT_CODE ON) -string(REGEX MATCH "[^-]+" LLVM_LIBGCC_TARGET_ARCH ${CMAKE_C_COMPILER_TARGET}) -target_link_libraries(libgcc_s PRIVATE - $ - $ -) -target_link_options(libgcc_s PRIVATE - -nostdlib - -Wl,--version-script,$) - -check_library_exists(m sin "" LLVM_LIBGCC_HAS_LIBM) -target_link_libraries(libgcc_s PRIVATE - $<$:m> - c -) - -extend_path(LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT "${CMAKE_INSTALL_PREFIX}" "${LIBUNWIND_INSTALL_LIBRARY_DIR}") -#string(REPLACE "${CMAKE_INSTALL_FULL_LIBDIR}/" "" LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT "${LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT}") - -install(TARGETS libgcc_s - LIBRARY DESTINATION "${LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT}" COMPONENT unwind - ARCHIVE DESTINATION "${LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT}" COMPONENT unwind - RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT unwind) - -get_compiler_rt_install_dir(${LLVM_LIBGCC_TARGET_ARCH} install_dir_builtins) -string(REGEX REPLACE "^lib/" "" install_dir_builtins "${install_dir_builtins}") -string(FIND "${install_dir_builtins}" "clang" install_path_contains_triple) -if(install_path_contains_triple 
EQUAL -1) - set(builtins_suffix "-${LLVM_LIBGCC_TARGET_ARCH}") -else() - string(PREPEND install_dir_builtins "../") -endif() -install(CODE "execute_process( - COMMAND \"\${CMAKE_COMMAND}\" -E - create_symlink ${install_dir_builtins}/libclang_rt.builtins${builtins_suffix}.a libgcc.a - WORKING_DIRECTORY \"\$ENV{DESTDIR}${LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT}\")" - COMPONENT unwind) - -install(CODE "execute_process( - COMMAND \"\${CMAKE_COMMAND}\" -E - create_symlink libunwind.a libgcc_eh.a - WORKING_DIRECTORY \"\$ENV{DESTDIR}${LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT}\")" - COMPONENT unwind) -install(CODE "execute_process( - COMMAND \"\${CMAKE_COMMAND}\" -E - create_symlink libunwind.so libgcc_s.so.1.0 - WORKING_DIRECTORY \"\$ENV{DESTDIR}${LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT}\")" - COMPONENT unwind) -install(CODE "execute_process( - COMMAND \"\${CMAKE_COMMAND}\" -E - create_symlink libgcc_s.so.1.0 libgcc_s.so.1 - WORKING_DIRECTORY \"\$ENV{DESTDIR}${LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT}\")" - COMPONENT unwind) -install(CODE "execute_process( - COMMAND \"\${CMAKE_COMMAND}\" -E - create_symlink libgcc_s.so.1 libgcc_s.so - WORKING_DIRECTORY \"\$ENV{DESTDIR}${LLVM_LIBGCC_LIBUNWIND_STATIC_ROOT}\")" - COMPONENT unwind) diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli index 4ec760a44571c..9e85f01df3835 100644 --- a/llvm/bindings/ocaml/llvm/llvm.mli +++ b/llvm/bindings/ocaml/llvm/llvm.mli @@ -2034,7 +2034,7 @@ val build_switch : llvalue -> llbasicblock -> int -> llbuilder -> llvalue (** [build_malloc ty name b] creates an [malloc] instruction at the position specified by the instruction builder [b]. - See the method [llvm::CallInst::CreateMalloc]. *) + See the method [llvm::IRBuilderBase::CreateMalloc]. *) val build_malloc : lltype -> string -> llbuilder -> llvalue (** [build_array_malloc ty val name b] creates an [array malloc] diff --git a/llvm/docs/CommandGuide/lit.rst b/llvm/docs/CommandGuide/lit.rst index 28319660d69c7..5e2443cb612a4 100644 --- a/llvm/docs/CommandGuide/lit.rst +++ b/llvm/docs/CommandGuide/lit.rst @@ -94,21 +94,19 @@ OUTPUT OPTIONS Show more information on test failures, for example the entire test output instead of just the test result. + Each command is printed before it is executed. This can be valuable for + debugging test failures, as the last printed command is the one that failed. + Moreover, :program:`lit` inserts ``'RUN: at line N'`` before each + command pipeline in the output to help you locate the source line of + the failed command. + .. option:: -vv, --echo-all-commands - On test failure, echo all commands to stdout as they are being executed. - This can be valuable for debugging test failures, as the last echoed command - will be the one which has failed. - :program:`lit` normally inserts a no-op command (``:`` in the case of bash) - with argument ``'RUN: at line N'`` before each command pipeline, and this - option also causes those no-op commands to be echoed to stdout to help you - locate the source line of the failed command. - This option implies ``--verbose``. + Deprecated alias for -v. .. option:: -a, --show-all - Show more information about all tests, for example the entire test - commandline and output. + Enable -v, but for all tests, not just failed tests. ..
option:: --no-progress-bar diff --git a/llvm/examples/BrainF/BrainF.cpp b/llvm/examples/BrainF/BrainF.cpp index b3ff5574d3fc7..849e5c7675307 100644 --- a/llvm/examples/BrainF/BrainF.cpp +++ b/llvm/examples/BrainF/BrainF.cpp @@ -89,14 +89,12 @@ void BrainF::header(LLVMContext& C) { //%arr = malloc i8, i32 %d ConstantInt *val_mem = ConstantInt::get(C, APInt(32, memtotal)); - BasicBlock* BB = builder->GetInsertBlock(); Type* IntPtrTy = IntegerType::getInt32Ty(C); Type* Int8Ty = IntegerType::getInt8Ty(C); Constant* allocsize = ConstantExpr::getSizeOf(Int8Ty); allocsize = ConstantExpr::getTruncOrBitCast(allocsize, IntPtrTy); - ptr_arr = CallInst::CreateMalloc(BB, IntPtrTy, Int8Ty, allocsize, val_mem, - nullptr, "arr"); - cast<Instruction>(ptr_arr)->insertInto(BB, BB->end()); + ptr_arr = builder->CreateMalloc(IntPtrTy, Int8Ty, allocsize, val_mem, nullptr, + "arr"); //call void @llvm.memset.p0i8.i32(i8 *%arr, i8 0, i32 %d, i1 0) { @@ -127,8 +125,9 @@ void BrainF::header(LLVMContext& C) { //brainf.end: endbb = BasicBlock::Create(C, label, brainf_func); - //call free(i8 *%arr) - CallInst::CreateFree(ptr_arr, endbb)->insertInto(endbb, endbb->end()); + // call free(i8 *%arr) + builder->SetInsertPoint(endbb); + builder->CreateFree(ptr_arr); //ret void ReturnInst::Create(C, endbb); diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h index df0784664eadc..401119b3cb852 100644 --- a/llvm/include/llvm/Analysis/InstructionSimplify.h +++ b/llvm/include/llvm/Analysis/InstructionSimplify.h @@ -339,8 +339,14 @@ simplifyInstructionWithOperands(Instruction *I, ArrayRef<Value *> NewOps, /// AllowRefinement specifies whether the simplification can be a refinement /// (e.g. 0 instead of poison), or whether it needs to be strictly identical. /// Op and RepOp can be assumed to not be poison when determining refinement. -Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, - const SimplifyQuery &Q, bool AllowRefinement); +/// +/// If DropFlags is passed, then the replacement result is only valid if +/// poison-generating flags/metadata on those instructions are dropped. This +/// is only useful in conjunction with AllowRefinement=false. +Value * +simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const SimplifyQuery &Q, bool AllowRefinement, + SmallVectorImpl<Instruction *> *DropFlags = nullptr); /// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively. /// diff --git a/llvm/include/llvm/CodeGen/SlotIndexes.h b/llvm/include/llvm/CodeGen/SlotIndexes.h index 7e013dbf2ab38..863f1f585ef99 100644 --- a/llvm/include/llvm/CodeGen/SlotIndexes.h +++ b/llvm/include/llvm/CodeGen/SlotIndexes.h @@ -640,6 +640,9 @@ class raw_ostream; renumberIndexes(newItr); llvm::sort(idx2MBBMap, less_first()); } + + /// Renumber all indexes using the default instruction distance. + void packIndexes(); }; // Specialize IntervalMapInfo for half-open slot index intervals. diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index ef86eefdf33b8..86be257918cac 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -619,6 +619,22 @@ class IRBuilderBase { TBAATag, ScopeTag, NoAliasTag); } + CallInst *CreateMalloc(Type *IntPtrTy, Type *AllocTy, Value *AllocSize, + Value *ArraySize, ArrayRef<OperandBundleDef> OpB, + Function *MallocF = nullptr, const Twine &Name = ""); + + /// CreateMalloc - Generate the IR for a call to malloc: + /// 1.
Compute the malloc call's argument as the specified type's size, + /// possibly multiplied by the array size if the array size is not + /// constant 1. + /// 2. Call malloc with that argument. + CallInst *CreateMalloc(Type *IntPtrTy, Type *AllocTy, Value *AllocSize, + Value *ArraySize, Function *MallocF = nullptr, + const Twine &Name = ""); + /// Generate the IR for a call to the builtin free function. + CallInst *CreateFree(Value *Source, + ArrayRef<OperandBundleDef> Bundles = std::nullopt); + CallInst *CreateElementUnorderedAtomicMemSet(Value *Ptr, Value *Val, Value *Size, Align Alignment, uint32_t ElementSize, diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 5e8739812260d..a0c8406fe4ec1 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1603,42 +1603,6 @@ class CallInst : public CallBase { static CallInst *Create(CallInst *CI, ArrayRef<OperandBundleDef> Bundles, Instruction *InsertPt = nullptr); - /// Generate the IR for a call to malloc: - /// 1. Compute the malloc call's argument as the specified type's size, - /// possibly multiplied by the array size if the array size is not - /// constant 1. - /// 2. Call malloc with that argument. - /// 3. Bitcast the result of the malloc call to the specified type. - static Instruction *CreateMalloc(Instruction *InsertBefore, Type *IntPtrTy, - Type *AllocTy, Value *AllocSize, - Value *ArraySize = nullptr, - Function *MallocF = nullptr, - const Twine &Name = ""); - static Instruction *CreateMalloc(BasicBlock *InsertAtEnd, Type *IntPtrTy, - Type *AllocTy, Value *AllocSize, - Value *ArraySize = nullptr, - Function *MallocF = nullptr, - const Twine &Name = ""); - static Instruction * - CreateMalloc(Instruction *InsertBefore, Type *IntPtrTy, Type *AllocTy, - Value *AllocSize, Value *ArraySize = nullptr, - ArrayRef<OperandBundleDef> Bundles = std::nullopt, - Function *MallocF = nullptr, const Twine &Name = ""); - static Instruction * - CreateMalloc(BasicBlock *InsertAtEnd, Type *IntPtrTy, Type *AllocTy, - Value *AllocSize, Value *ArraySize = nullptr, - ArrayRef<OperandBundleDef> Bundles = std::nullopt, - Function *MallocF = nullptr, const Twine &Name = ""); - /// Generate the IR for a call to the builtin free function. - static Instruction *CreateFree(Value *Source, Instruction *InsertBefore); - static Instruction *CreateFree(Value *Source, BasicBlock *InsertAtEnd); - static Instruction *CreateFree(Value *Source, - ArrayRef<OperandBundleDef> Bundles, - Instruction *InsertBefore); - static Instruction *CreateFree(Value *Source, - ArrayRef<OperandBundleDef> Bundles, - BasicBlock *InsertAtEnd); - // Note that 'musttail' implies 'tail'. enum TailCallKind : unsigned { TCK_None = 0, diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 2b8602f430dff..c06f77a05c051 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -12,6 +12,7 @@ let TargetPrefix = "spv" in { def int_spv_assign_type : Intrinsic<[], [llvm_any_ty, llvm_metadata_ty]>; + def int_spv_assign_ptr_type : Intrinsic<[], [llvm_any_ty, llvm_metadata_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>; def int_spv_assign_name : Intrinsic<[], [llvm_any_ty, llvm_vararg_ty]>; def int_spv_track_constant : Intrinsic<[llvm_any_ty], [llvm_any_ty, llvm_metadata_ty]>; diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h index 9e1e5111dbf72..6fb55f1cf1686 100644 --- a/llvm/include/llvm/LTO/Config.h +++ b/llvm/include/llvm/LTO/Config.h @@ -80,6 +80,12 @@ struct Config { /// link.
bool HasWholeProgramVisibility = false; + /// We're validating that all native vtables have corresponding type infos. + bool ValidateAllVtablesHaveTypeInfos = false; + /// If all native vtables have corresponding type infos, allow + /// usage of RTTI to block devirtualization on types used in native files. + bool AllVtablesHaveTypeInfos = false; + /// Always emit a Regular LTO object even when it is empty because no Regular /// LTO modules were linked. This option is useful for some build system which /// want to know a priori all possible output files. diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h index cb8fa380a3aa3..59378bc10873e 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h @@ -252,6 +252,7 @@ class FunctionSpecializer { SmallPtrSet<Function *, 32> FullySpecialized; DenseMap<Function *, CodeMetrics> FunctionMetrics; DenseMap<Function *, unsigned> FunctionGrowth; + unsigned NGlobals = 0; public: FunctionSpecializer( diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h index 9e121d9c6f4ed..0be3146f695a6 100644 --- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h +++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h @@ -243,10 +243,18 @@ void updatePublicTypeTestCalls(Module &M, bool WholeProgramVisibilityEnabledInLTO); void updateVCallVisibilityInModule( Module &M, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet<GlobalValue::GUID> &DynamicExportSymbols); + const DenseSet<GlobalValue::GUID> &DynamicExportSymbols, + bool ValidateAllVtablesHaveTypeInfos, + function_ref<bool(StringRef)> IsVisibleToRegularObj); void updateVCallVisibilityInIndex( ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet<GlobalValue::GUID> &DynamicExportSymbols); + const DenseSet<GlobalValue::GUID> &DynamicExportSymbols, + const DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols); + +void getVisibleToRegularObjVtableGUIDs( + ModuleSummaryIndex &Index, + DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols, + function_ref<bool(StringRef)> IsVisibleToRegularObj); /// Perform index-based whole program devirtualization on the \p Summary /// index. Any devirtualized targets used by a type test in another module diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index e8f96e9f681f2..d8aa614cae53b 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4299,6 +4299,7 @@ Value *llvm::simplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const SimplifyQuery &Q, bool AllowRefinement, + SmallVectorImpl<Instruction *> *DropFlags, unsigned MaxRecurse) { // Trivial replacement. if (V == Op) @@ -4333,7 +4334,7 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, bool AnyReplaced = false; for (Value *InstOp : I->operands()) { if (Value *NewInstOp = simplifyWithOpReplaced( - InstOp, Op, RepOp, Q, AllowRefinement, MaxRecurse)) { + InstOp, Op, RepOp, Q, AllowRefinement, DropFlags, MaxRecurse)) { NewOps.push_back(NewInstOp); AnyReplaced = InstOp != NewInstOp; } else { @@ -4427,16 +4428,23 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // will be done in InstCombine). // TODO: This may be unsound, because it only catches some forms of // refinement.
- if (!AllowRefinement && canCreatePoison(cast(I))) - return nullptr; + if (!AllowRefinement) { + if (canCreatePoison(cast(I), !DropFlags)) + return nullptr; + Constant *Res = ConstantFoldInstOperands(I, ConstOps, Q.DL, Q.TLI); + if (DropFlags && Res && I->hasPoisonGeneratingFlagsOrMetadata()) + DropFlags->push_back(I); + return Res; + } return ConstantFoldInstOperands(I, ConstOps, Q.DL, Q.TLI); } Value *llvm::simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, const SimplifyQuery &Q, - bool AllowRefinement) { - return ::simplifyWithOpReplaced(V, Op, RepOp, Q, AllowRefinement, + bool AllowRefinement, + SmallVectorImpl *DropFlags) { + return ::simplifyWithOpReplaced(V, Op, RepOp, Q, AllowRefinement, DropFlags, RecursionLimit); } @@ -4569,11 +4577,11 @@ static Value *simplifySelectWithICmpEq(Value *CmpLHS, Value *CmpRHS, unsigned MaxRecurse) { if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, /* AllowRefinement */ false, - MaxRecurse) == TrueVal) + /* DropFlags */ nullptr, MaxRecurse) == TrueVal) return FalseVal; if (simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, /* AllowRefinement */ true, - MaxRecurse) == FalseVal) + /* DropFlags */ nullptr, MaxRecurse) == FalseVal) return FalseVal; return nullptr; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 8a779ac9fb94f..245cce25a3d57 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -942,6 +942,22 @@ static void findForkedSCEVs( ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr)); break; } + case Instruction::PHI: { + SmallVector, 2> ChildScevs; + // A phi means we've found a forked pointer, but we currently only + // support a single phi per pointer so if there's another behind this + // then we just bail out and return the generic SCEV. + if (I->getNumOperands() == 2) { + findForkedSCEVs(SE, L, I->getOperand(0), ChildScevs, Depth); + findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth); + } + if (ChildScevs.size() == 2) { + ScevList.push_back(ChildScevs[0]); + ScevList.push_back(ChildScevs[1]); + } else + ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr)); + break; + } case Instruction::Add: case Instruction::Sub: { SmallVector> LScevs; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index a5e8024090866..0c3f558ac2a64 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -6586,23 +6586,25 @@ LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) { // round(x) => // t = trunc(x); // d = fabs(x - t); - // o = copysign(1.0f, x); - // return t + (d >= 0.5 ? o : 0.0); + // o = copysign(d >= 0.5 ? 
1.0 : 0.0, x); + // return t + o; auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags); auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags); auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags); - auto Zero = MIRBuilder.buildFConstant(Ty, 0.0); - auto One = MIRBuilder.buildFConstant(Ty, 1.0); + auto Half = MIRBuilder.buildFConstant(Ty, 0.5); - auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X); + auto Cmp = + MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags); - auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, - Flags); - auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags); + // Could emit G_UITOFP instead + auto One = MIRBuilder.buildFConstant(Ty, 1.0); + auto Zero = MIRBuilder.buildFConstant(Ty, 0.0); + auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero); + auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X); - MIRBuilder.buildFAdd(DstReg, T, Sel, Flags); + MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags); MI.eraseFromParent(); return Legalized; diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index f97cb1a0fb722..36625c4848c53 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2692,6 +2692,9 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { return false; Indexes = &getAnalysis<SlotIndexes>(); + // Renumber to get accurate and consistent results from + // SlotIndexes::getApproxInstrDistance. + Indexes->packIndexes(); MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); DomTree = &getAnalysis<MachineDominatorTree>(); ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 484a6231b7f65..db1ebe0e26b9a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -539,6 +539,8 @@ namespace { SDValue visitMSCATTER(SDNode *N); SDValue visitVPGATHER(SDNode *N); SDValue visitVPSCATTER(SDNode *N); + SDValue visitVP_STRIDED_LOAD(SDNode *N); + SDValue visitVP_STRIDED_STORE(SDNode *N); SDValue visitFP_TO_FP16(SDNode *N); SDValue visitFP16_TO_FP(SDNode *N); SDValue visitFP_TO_BF16(SDNode *N); @@ -11872,6 +11874,21 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitVP_STRIDED_STORE(SDNode *N) { + auto *SST = cast<VPStridedStoreSDNode>(N); + EVT EltVT = SST->getValue().getValueType().getVectorElementType(); + // Combine strided stores with unit-stride to a regular VP store. + if (auto *CStride = dyn_cast<ConstantSDNode>(SST->getStride()); + CStride && CStride->getZExtValue() == EltVT.getStoreSize()) { + return DAG.getStoreVP(SST->getChain(), SDLoc(N), SST->getValue(), + SST->getBasePtr(), SST->getOffset(), SST->getMask(), + SST->getVectorLength(), SST->getMemoryVT(), + SST->getMemOperand(), SST->getAddressingMode(), + SST->isTruncatingStore(), SST->isCompressingStore()); + } + return SDValue(); +} + SDValue DAGCombiner::visitVPGATHER(SDNode *N) { VPGatherSDNode *MGT = cast<VPGatherSDNode>(N); SDValue Mask = MGT->getMask(); @@ -11959,6 +11976,22 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) { + auto *SLD = cast<VPStridedLoadSDNode>(N); + EVT EltVT = SLD->getValueType(0).getVectorElementType(); + // Combine strided loads with unit-stride to a regular VP load.
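+ // A constant stride equal to the element's store size means the elements + // are contiguous in memory, which is exactly the access pattern of a + // regular VP load.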
+ if (auto *CStride = dyn_cast<ConstantSDNode>(SLD->getStride()); + CStride && CStride->getZExtValue() == EltVT.getStoreSize()) { + SDValue NewLd = DAG.getLoadVP( + SLD->getAddressingMode(), SLD->getExtensionType(), SLD->getValueType(0), + SDLoc(N), SLD->getChain(), SLD->getBasePtr(), SLD->getOffset(), + SLD->getMask(), SLD->getVectorLength(), SLD->getMemoryVT(), + SLD->getMemOperand(), SLD->isExpandingLoad()); + return CombineTo(N, NewLd, NewLd.getValue(1)); + } + return SDValue(); +} + /// A vector select of 2 constant vectors can be simplified to math/logic to /// avoid a variable select instruction and possibly avoid constant loads. SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) { @@ -25976,6 +26009,14 @@ SDValue DAGCombiner::visitVPOp(SDNode *N) { if (SDValue SD = visitVPSCATTER(N)) return SD; + if (N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD) + if (SDValue SD = visitVP_STRIDED_LOAD(N)) + return SD; + + if (N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE) + if (SDValue SD = visitVP_STRIDED_STORE(N)) + return SD; + // VP operations in which all vector elements are disabled - either by // determining that the mask is all false or that the EVL is 0 - can be // eliminated. diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp index 47ee36971d0ea..65726f06dedb4 100644 --- a/llvm/lib/CodeGen/SlotIndexes.cpp +++ b/llvm/lib/CodeGen/SlotIndexes.cpp @@ -237,6 +237,11 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, } } +void SlotIndexes::packIndexes() { + for (auto [Index, Entry] : enumerate(indexList)) + Entry.setIndex(Index * SlotIndex::InstrDist); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void SlotIndexes::dump() const { for (const IndexListEntry &ILE : indexList) { diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 560a0a4fbac66..4ae396723ef09 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1566,7 +1566,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, MachineOperand &MO = MI->getOperand(SrcIdx); assert(MO.isReg() && MO.getReg() == RegB && MO.isUse() && "inconsistent operand info for 2-reg pass"); - if (MO.isKill()) { + if (isPlainlyKilled(MO)) { MO.setIsKill(false); RemovedKillFlag = true; } @@ -1587,7 +1587,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, for (MachineOperand &MO : MI->all_uses()) { if (MO.getReg() == RegB) { if (MO.getSubReg() == SubRegB && !IsEarlyClobber) { - if (MO.isKill()) { + if (isPlainlyKilled(MO)) { MO.setIsKill(false); RemovedKillFlag = true; } diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 17093fa0ac4ee..71aec8bdf6f35 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -3534,10 +3534,8 @@ LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty, Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext()); Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty)); AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy); - Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), - ITy, unwrap(Ty), AllocSize, - nullptr, nullptr, ""); - return wrap(unwrap(B)->Insert(Malloc, Twine(Name))); + return wrap(unwrap(B)->CreateMalloc(ITy, unwrap(Ty), AllocSize, nullptr, + nullptr, Name)); } LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty, @@ -3545,10 +3543,8 @@ LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty, Type* ITy =
Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext()); Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty)); AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy); - Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), - ITy, unwrap(Ty), AllocSize, - unwrap(Val), nullptr, ""); - return wrap(unwrap(B)->Insert(Malloc, Twine(Name))); + return wrap(unwrap(B)->CreateMalloc(ITy, unwrap(Ty), AllocSize, unwrap(Val), + nullptr, Name)); } LLVMValueRef LLVMBuildMemSet(LLVMBuilderRef B, LLVMValueRef Ptr, @@ -3587,8 +3583,7 @@ LLVMValueRef LLVMBuildArrayAlloca(LLVMBuilderRef B, LLVMTypeRef Ty, } LLVMValueRef LLVMBuildFree(LLVMBuilderRef B, LLVMValueRef PointerVal) { - return wrap(unwrap(B)->Insert( - CallInst::CreateFree(unwrap(PointerVal), unwrap(B)->GetInsertBlock()))); + return wrap(unwrap(B)->CreateFree(unwrap(PointerVal))); } LLVMValueRef LLVMBuildLoad2(LLVMBuilderRef B, LLVMTypeRef Ty, diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 974e29841e1bc..b321d8b325fe0 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -291,6 +291,84 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy( return CI; } +/// isConstantOne - Return true only if val is constant int 1 +static bool isConstantOne(const Value *Val) { + assert(Val && "isConstantOne does not work with nullptr Val"); + const ConstantInt *CVal = dyn_cast<ConstantInt>(Val); + return CVal && CVal->isOne(); +} + +CallInst *IRBuilderBase::CreateMalloc(Type *IntPtrTy, Type *AllocTy, + Value *AllocSize, Value *ArraySize, + ArrayRef<OperandBundleDef> OpB, + Function *MallocF, const Twine &Name) { + // malloc(type) becomes: + // i8* malloc(typeSize) + // malloc(type, arraySize) becomes: + // i8* malloc(typeSize*arraySize) + if (!ArraySize) + ArraySize = ConstantInt::get(IntPtrTy, 1); + else if (ArraySize->getType() != IntPtrTy) + ArraySize = CreateIntCast(ArraySize, IntPtrTy, false); + + if (!isConstantOne(ArraySize)) { + if (isConstantOne(AllocSize)) { + AllocSize = ArraySize; // Operand * 1 = Operand + } else { + // Multiply type size by the array size... + AllocSize = CreateMul(ArraySize, AllocSize, "mallocsize"); + } + } + + assert(AllocSize->getType() == IntPtrTy && "malloc arg is wrong size"); + // Create the call to Malloc. + Module *M = BB->getParent()->getParent(); + Type *BPTy = PointerType::getUnqual(Context); + FunctionCallee MallocFunc = MallocF; + if (!MallocFunc) + // prototype malloc as "void *malloc(size_t)" + MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy); + CallInst *MCall = CreateCall(MallocFunc, AllocSize, OpB, Name); + + MCall->setTailCall(); + if (Function *F = dyn_cast<Function>(MallocFunc.getCallee())) { + MCall->setCallingConv(F->getCallingConv()); + F->setReturnDoesNotAlias(); + } + + assert(!MCall->getType()->isVoidTy() && "Malloc has void return type"); + + return MCall; +} + +CallInst *IRBuilderBase::CreateMalloc(Type *IntPtrTy, Type *AllocTy, + Value *AllocSize, Value *ArraySize, + Function *MallocF, const Twine &Name) { + + return CreateMalloc(IntPtrTy, AllocTy, AllocSize, ArraySize, std::nullopt, + MallocF, Name); +} + +/// CreateFree - Generate the IR for a call to the builtin free function.
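/// A sketch of caller-side usage for the new builder methods (the type and
/// names here are illustrative, not from this patch):
///   IRBuilder<> B(InsertPt);
///   Type *IntPtrTy = B.getInt64Ty();               // size_t on the target
///   Value *Size = ConstantExpr::getSizeOf(SomeTy); // i64 sizeof(SomeTy)
///   CallInst *Mem = B.CreateMalloc(IntPtrTy, SomeTy, Size,
///                                  /*ArraySize=*/nullptr,
///                                  /*MallocF=*/nullptr, "mem");
///   B.CreateFree(Mem);
/// Unlike the removed CallInst helpers, no insertion point is passed in:
/// the calls land wherever the builder currently points.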
+CallInst *IRBuilderBase::CreateFree(Value *Source, + ArrayRef<OperandBundleDef> Bundles) { + assert(Source->getType()->isPointerTy() && + "Can not free something of nonpointer type!"); + + Module *M = BB->getParent()->getParent(); + + Type *VoidTy = Type::getVoidTy(M->getContext()); + Type *VoidPtrTy = PointerType::getUnqual(M->getContext()); + // prototype free as "void free(void*)" + FunctionCallee FreeFunc = M->getOrInsertFunction("free", VoidTy, VoidPtrTy); + CallInst *Result = CreateCall(FreeFunc, Source, Bundles, ""); + Result->setTailCall(); + if (Function *F = dyn_cast<Function>(FreeFunc.getCallee())) + Result->setCallingConv(F->getCallingConv()); + + return Result; +} + CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove( Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, uint32_t ElementSize, MDNode *TBAATag, MDNode *TBAAStructTag, diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 72d3c85e5c9fe..81dfa14f2e6b5 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -809,186 +809,6 @@ void CallInst::updateProfWeight(uint64_t S, uint64_t T) { setMetadata(LLVMContext::MD_prof, MDNode::get(getContext(), Vals)); } -/// IsConstantOne - Return true only if val is constant int 1 -static bool IsConstantOne(Value *val) { - assert(val && "IsConstantOne does not work with nullptr val"); - const ConstantInt *CVal = dyn_cast<ConstantInt>(val); - return CVal && CVal->isOne(); -} - -static Instruction *createMalloc(Instruction *InsertBefore, - BasicBlock *InsertAtEnd, Type *IntPtrTy, - Type *AllocTy, Value *AllocSize, - Value *ArraySize, - ArrayRef<OperandBundleDef> OpB, - Function *MallocF, const Twine &Name) { - assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) && - "createMalloc needs either InsertBefore or InsertAtEnd"); - - // malloc(type) becomes: - // bitcast (i8* malloc(typeSize)) to type* - // malloc(type, arraySize) becomes: - // bitcast (i8* malloc(typeSize*arraySize)) to type* - if (!ArraySize) - ArraySize = ConstantInt::get(IntPtrTy, 1); - else if (ArraySize->getType() != IntPtrTy) { - if (InsertBefore) - ArraySize = CastInst::CreateIntegerCast(ArraySize, IntPtrTy, false, - "", InsertBefore); - else - ArraySize = CastInst::CreateIntegerCast(ArraySize, IntPtrTy, false, - "", InsertAtEnd); - } - - if (!IsConstantOne(ArraySize)) { - if (IsConstantOne(AllocSize)) { - AllocSize = ArraySize; // Operand * 1 = Operand - } else if (Constant *CO = dyn_cast<Constant>(ArraySize)) { - Constant *Scale = ConstantExpr::getIntegerCast(CO, IntPtrTy, - false /*ZExt*/); - // Malloc arg is constant product of type size and array size - AllocSize = ConstantExpr::getMul(Scale, cast<Constant>(AllocSize)); - } else { - // Multiply type size by the array size... - if (InsertBefore) - AllocSize = BinaryOperator::CreateMul(ArraySize, AllocSize, - "mallocsize", InsertBefore); - else - AllocSize = BinaryOperator::CreateMul(ArraySize, AllocSize, - "mallocsize", InsertAtEnd); - } - } - - assert(AllocSize->getType() == IntPtrTy && "malloc arg is wrong size"); - // Create the call to Malloc. - BasicBlock *BB = InsertBefore ?
InsertBefore->getParent() : InsertAtEnd; - Module *M = BB->getParent()->getParent(); - Type *BPTy = PointerType::getUnqual(BB->getContext()); - FunctionCallee MallocFunc = MallocF; - if (!MallocFunc) - // prototype malloc as "void *malloc(size_t)" - MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy); - CallInst *MCall = nullptr; - if (InsertBefore) { - MCall = CallInst::Create(MallocFunc, AllocSize, OpB, Name, - InsertBefore); - } else { - MCall = CallInst::Create(MallocFunc, AllocSize, OpB, Name); - } - MCall->setTailCall(); - if (Function *F = dyn_cast<Function>(MallocFunc.getCallee())) { - MCall->setCallingConv(F->getCallingConv()); - if (!F->returnDoesNotAlias()) - F->setReturnDoesNotAlias(); - } - assert(!MCall->getType()->isVoidTy() && "Malloc has void return type"); - - return MCall; -} - -/// CreateMalloc - Generate the IR for a call to malloc: -/// 1. Compute the malloc call's argument as the specified type's size, -/// possibly multiplied by the array size if the array size is not -/// constant 1. -/// 2. Call malloc with that argument. -/// 3. Bitcast the result of the malloc call to the specified type. -Instruction *CallInst::CreateMalloc(Instruction *InsertBefore, - Type *IntPtrTy, Type *AllocTy, - Value *AllocSize, Value *ArraySize, - Function *MallocF, - const Twine &Name) { - return createMalloc(InsertBefore, nullptr, IntPtrTy, AllocTy, AllocSize, - ArraySize, std::nullopt, MallocF, Name); -} -Instruction *CallInst::CreateMalloc(Instruction *InsertBefore, - Type *IntPtrTy, Type *AllocTy, - Value *AllocSize, Value *ArraySize, - ArrayRef<OperandBundleDef> OpB, - Function *MallocF, - const Twine &Name) { - return createMalloc(InsertBefore, nullptr, IntPtrTy, AllocTy, AllocSize, - ArraySize, OpB, MallocF, Name); -} - -/// CreateMalloc - Generate the IR for a call to malloc: -/// 1. Compute the malloc call's argument as the specified type's size, -/// possibly multiplied by the array size if the array size is not -/// constant 1. -/// 2. Call malloc with that argument. -/// 3. Bitcast the result of the malloc call to the specified type. -/// Note: This function does not add the bitcast to the basic block, that is the -/// responsibility of the caller. -Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd, - Type *IntPtrTy, Type *AllocTy, - Value *AllocSize, Value *ArraySize, - Function *MallocF, const Twine &Name) { - return createMalloc(nullptr, InsertAtEnd, IntPtrTy, AllocTy, AllocSize, - ArraySize, std::nullopt, MallocF, Name); -} -Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd, - Type *IntPtrTy, Type *AllocTy, - Value *AllocSize, Value *ArraySize, - ArrayRef<OperandBundleDef> OpB, - Function *MallocF, const Twine &Name) { - return createMalloc(nullptr, InsertAtEnd, IntPtrTy, AllocTy, AllocSize, - ArraySize, OpB, MallocF, Name); -} - -static Instruction *createFree(Value *Source, - ArrayRef<OperandBundleDef> Bundles, - Instruction *InsertBefore, - BasicBlock *InsertAtEnd) { - assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) && - "createFree needs either InsertBefore or InsertAtEnd"); - assert(Source->getType()->isPointerTy() && - "Can not free something of nonpointer type!"); - - BasicBlock *BB = InsertBefore ?
InsertBefore->getParent() : InsertAtEnd; - Module *M = BB->getParent()->getParent(); - - Type *VoidTy = Type::getVoidTy(M->getContext()); - Type *VoidPtrTy = PointerType::getUnqual(M->getContext()); - // prototype free as "void free(void*)" - FunctionCallee FreeFunc = M->getOrInsertFunction("free", VoidTy, VoidPtrTy); - CallInst *Result = nullptr; - if (InsertBefore) - Result = CallInst::Create(FreeFunc, Source, Bundles, "", InsertBefore); - else - Result = CallInst::Create(FreeFunc, Source, Bundles, ""); - Result->setTailCall(); - if (Function *F = dyn_cast<Function>(FreeFunc.getCallee())) - Result->setCallingConv(F->getCallingConv()); - - return Result; -} - -/// CreateFree - Generate the IR for a call to the builtin free function. -Instruction *CallInst::CreateFree(Value *Source, Instruction *InsertBefore) { - return createFree(Source, std::nullopt, InsertBefore, nullptr); -} -Instruction *CallInst::CreateFree(Value *Source, - ArrayRef<OperandBundleDef> Bundles, - Instruction *InsertBefore) { - return createFree(Source, Bundles, InsertBefore, nullptr); -} - -/// CreateFree - Generate the IR for a call to the builtin free function. -/// Note: This function does not add the call to the basic block, that is the -/// responsibility of the caller. -Instruction *CallInst::CreateFree(Value *Source, BasicBlock *InsertAtEnd) { - Instruction *FreeCall = - createFree(Source, std::nullopt, nullptr, InsertAtEnd); - assert(FreeCall && "CreateFree did not create a CallInst"); - return FreeCall; -} -Instruction *CallInst::CreateFree(Value *Source, - ArrayRef<OperandBundleDef> Bundles, - BasicBlock *InsertAtEnd) { - Instruction *FreeCall = createFree(Source, Bundles, nullptr, InsertAtEnd); - assert(FreeCall && "CreateFree did not create a CallInst"); - return FreeCall; -} - //===----------------------------------------------------------------------===// // InvokeInst Implementation //===----------------------------------------------------------------------===// diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 3e008edd3ee0b..4a64aa4593d54 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1270,13 +1270,27 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { updateMemProfAttributes(*RegularLTO.CombinedModule, ThinLTO.CombinedIndex); + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + + // This returns true when the name is local or not defined. Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || It->second.VisibleOutsideSummary); + }; + // If allowed, upgrade public vcall visibility metadata to linkage unit // visibility before whole program devirtualization in the optimizer.
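// (Sketch of the decision being wired up here: when
// ValidateAllVtablesHaveTypeInfos is set, the visibility upgrade is
// additionally gated on AllVtablesHaveTypeInfos, and the
// IsVisibleToRegularObj query answers conservatively: a name missing from
// GlobalResolutions counts as visible. That way a vtable that a regular,
// non-LTO object file might still reference is never given linkage-unit
// visibility behind the linker's back.)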
- updateVCallVisibilityInModule(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInModule( + *RegularLTO.CombinedModule, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, Conf.ValidateAllVtablesHaveTypeInfos, + IsVisibleToRegularObj); updatePublicTypeTestCalls(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility); + WholeProgramVisibilityEnabledInLTO); if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule)) @@ -1683,13 +1697,38 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, std::set<GlobalValue::GUID> ExportedGUIDs; - if (hasWholeProgramVisibility(Conf.HasWholeProgramVisibility)) + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) ThinLTO.CombinedIndex.setWithWholeProgramVisibility(); + + // If we're validating, get the vtable symbols that should not be + // upgraded because they correspond to typeIDs outside of index-based + // WPD info. + DenseSet<GlobalValue::GUID> VisibleToRegularObjSymbols; + if (WholeProgramVisibilityEnabledInLTO && + Conf.ValidateAllVtablesHaveTypeInfos) { + // This returns true when the name is local or not defined. Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || + It->second.VisibleOutsideSummary); + }; + + getVisibleToRegularObjVtableGUIDs(ThinLTO.CombinedIndex, + VisibleToRegularObjSymbols, + IsVisibleToRegularObj); + } + // If allowed, upgrade public vcall visibility to linkage unit visibility in // the summaries before whole program devirtualization below. - updateVCallVisibilityInIndex(ThinLTO.CombinedIndex, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInIndex( + ThinLTO.CombinedIndex, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, VisibleToRegularObjSymbols); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. if we are instead diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 52a4a9ba85e81..52d8fff14be9c 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -605,11 +605,14 @@ bool LTOCodeGenerator::optimize() { // pipeline run below. updatePublicTypeTestCalls(*MergedModule, /* WholeProgramVisibilityEnabledInLTO */ false); - updateVCallVisibilityInModule(*MergedModule, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a - // TBD new interface. - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *MergedModule, + /* WholeProgramVisibilityEnabledInLTO */ false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // We always run the verifier once on the merged module, the `DisableVerify` // parameter only applies to subsequent verify.
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 02a45351462da..acff1e2cf3b33 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -1057,11 +1057,14 @@ void ThinLTOCodeGenerator::run() { // via the internal option. Must be done before WPD below. if (hasWholeProgramVisibility(/* WholeProgramVisibilityEnabledInLTO */ false)) Index->setWithWholeProgramVisibility(); + + // FIXME: This needs linker information via a TBD new interface updateVCallVisibilityInIndex(*Index, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a // TBD new interface. - /* DynamicExportSymbols */ {}); + /*DynamicExportSymbols=*/{}, + /*VisibleToRegularObjSymbols=*/{}); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. if we are instead diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5cc001c44e7a2..576339cfbd00a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5335,7 +5335,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_neon_uaddlv: { EVT OpVT = Op.getOperand(1).getValueType(); EVT ResVT = Op.getValueType(); - if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) { + if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || + OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) { // In order to avoid insert_subvector, use v4i32 rather than v2i32. SDValue UADDLV = DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1)); @@ -22273,6 +22274,7 @@ static SDValue performSelectCombine(SDNode *N, static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); + SDLoc DL(N); // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the // 128bit vector version. if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) { SmallVector<SDValue> Ops(N->ops()); if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(), DCI.DAG.getVTList(LVT), Ops)) { - SDLoc DL(N); return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0), DCI.DAG.getConstant(0, DL, MVT::i64)); } } - if (N->getOpcode() == AArch64ISD::DUP) + if (N->getOpcode() == AArch64ISD::DUP) { + if (DCI.isAfterLegalizeDAG()) { + // If scalar dup's operand is extract_vector_elt, try to combine them into + // duplane.
For example, + // + // t21: i32 = extract_vector_elt t19, Constant:i64<0> + // t18: v4i32 = AArch64ISD::DUP t21 + // ==> + // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0> + SDValue EXTRACT_VEC_ELT = N->getOperand(0); + if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) { + unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); + return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0), + EXTRACT_VEC_ELT.getOperand(1)); + } + } + } + return performPostLD1Combine(N, DCI, false); + } return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5ad2e4837e3db..bceea75f27822 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2234,6 +2234,7 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: + case AArch64::LDR_PXI: if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { FrameIndex = MI.getOperand(1).getIndex(); @@ -2257,7 +2258,6 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: - case AArch64::LDR_PXI: case AArch64::STR_PXI: if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 90655aca77f7d..8954b563238a2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6472,12 +6472,24 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))) (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)), ssub))>; +def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), + (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))>; + +def : Pat<(v4i32 (AArch64uaddlv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))), + (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$op), hsub))>; + def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))), (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>; +def : Pat<(v4i32 (AArch64uaddlv (v4i16 V64:$Rn))), + (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv4i16v V64:$Rn), ssub))>; + def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))), (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>; +def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))), + (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>; + // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. // In effect, opNode is the same as (scalar_to_vector (IntNode)). 
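// Why the DUPLANE form above is a win (illustrative note, not from the
// patch): DUP of an extracted lane round-trips the value through a GPR,
//   mov  w8, v1.s[0]
//   dup  v0.4s, w8
// while the rewritten node selects to a single vector-side broadcast,
//   dup  v0.4s, v1.s[0]
// The VT equality check ensures the source vector's lane width matches the
// result's element width, so getDUPLANEOp picks the right DUPLANE opcode.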
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 9698b6e495c4a..42479b8970c34 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2429,18 +2429,16 @@ SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); const SDValue One = DAG.getConstantFP(1.0, SL, VT); - const SDValue Half = DAG.getConstantFP(0.5, SL, VT); - - SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X); EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + const SDValue Half = DAG.getConstantFP(0.5, SL, VT); SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero); - SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero); - - return DAG.getNode(ISD::FADD, SL, VT, T, Sel); + SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X); + return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset); } SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 6a87397862e4a..abe2f5a6246fc 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -27,6 +27,7 @@ add_public_tablegen_target(RISCVCommonTableGen) add_llvm_target(RISCVCodeGen RISCVAsmPrinter.cpp RISCVCodeGenPrepare.cpp + RISCVDeadRegisterDefinitions.cpp RISCVMakeCompressible.cpp RISCVExpandAtomicPseudoInsts.cpp RISCVExpandPseudoInsts.cpp diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index b63a5cea823e4..716c3ac14d116 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -442,8 +442,11 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, RelaxCandidate = true; break; } - } else if (Kind == MCExpr::SymbolRef && - cast<MCSymbolRefExpr>(Expr)->getKind() == MCSymbolRefExpr::VK_None) { + } else if ((Kind == MCExpr::SymbolRef && + cast<MCSymbolRefExpr>(Expr)->getKind() == + MCSymbolRefExpr::VK_None) || + Kind == MCExpr::Binary) { + // FIXME: Sub kind binary exprs have a chance of underflow. if (MIFrm == RISCVII::InstFormatJ) { FixupKind = RISCV::fixup_riscv_jal; } else if (MIFrm == RISCVII::InstFormatB) { diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 35c73d07a7d3c..702f69bcd4780 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -33,6 +33,9 @@ class RISCVTargetMachine; FunctionPass *createRISCVCodeGenPreparePass(); void initializeRISCVCodeGenPreparePass(PassRegistry &); +FunctionPass *createRISCVDeadRegisterDefinitionsPass(); +void initializeRISCVDeadRegisterDefinitionsPass(PassRegistry &); + FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM, CodeGenOptLevel OptLevel); diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 17326c68f0bd3..526520e6c5267 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -236,7 +236,7 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, // RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand).
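// (Context sketch: a memory-constraint inline-asm operand reaches this
// point decomposed as AddrReg + Offset, and Offset may be an immediate, a
// global, or, with this change, a block address such as one produced by
// GNU labels-as-values (&&label); previously the block-address case fell
// through to the "return true" bail-out below and was rejected.)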
if (!AddrReg.isReg()) return true; - if (!Offset.isImm() && !Offset.isGlobal()) + if (!Offset.isImm() && !Offset.isGlobal() && !Offset.isBlockAddress()) return true; MCOperand MCO; @@ -245,7 +245,7 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, if (Offset.isImm()) OS << MCO.getImm(); - else if (Offset.isGlobal()) + else if (Offset.isGlobal() || Offset.isBlockAddress()) OS << *MCO.getExpr(); OS << "(" << RISCVInstPrinter::getRegisterName(AddrReg.getReg()) << ")"; return false; diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp new file mode 100644 index 0000000000000..7099f36dcd43c --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp @@ -0,0 +1,106 @@ +//===- RISCVDeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// +// +// This pass rewrites Rd to x0 for instrs whose return values are unused. +// +//===---------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVInstrInfo.h" +#include "RISCVSubtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; +#define DEBUG_TYPE "riscv-dead-defs" +#define RISCV_DEAD_REG_DEF_NAME "RISC-V Dead register definitions" + +STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); + +namespace { +class RISCVDeadRegisterDefinitions : public MachineFunctionPass { +public: + static char ID; + + RISCVDeadRegisterDefinitions() : MachineFunctionPass(ID) { + initializeRISCVDeadRegisterDefinitionsPass( + *PassRegistry::getPassRegistry()); + } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return RISCV_DEAD_REG_DEF_NAME; } +}; +} // end anonymous namespace + +char RISCVDeadRegisterDefinitions::ID = 0; +INITIALIZE_PASS(RISCVDeadRegisterDefinitions, DEBUG_TYPE, + RISCV_DEAD_REG_DEF_NAME, false, false) + +FunctionPass *llvm::createRISCVDeadRegisterDefinitionsPass() { + return new RISCVDeadRegisterDefinitions(); +} + +bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + LLVM_DEBUG(dbgs() << "***** RISCVDeadRegisterDefinitions *****\n"); + + bool MadeChange = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + // We only handle non-computational instructions since some NOP encodings + // are reserved for HINT instructions. + const MCInstrDesc &Desc = MI.getDesc(); + if (!Desc.mayLoad() && !Desc.mayStore() && + !Desc.hasUnmodeledSideEffects()) + continue; + // For PseudoVSETVLIX0, Rd = X0 has special meaning. 
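// (In the vsetvli encoding, rd=x0 together with rs1=x0 means "keep the
// current vl, change only vtype". PseudoVSETVLIX0 already has rs1 fixed to
// x0, so rewriting a dead rd to x0 would not merely drop a result: it would
// silently switch the instruction from "set vl to VLMAX" to "preserve vl",
// hence the early continue below.)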
+ if (MI.getOpcode() == RISCV::PseudoVSETVLIX0) + continue; + for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) { + MachineOperand &MO = MI.getOperand(I); + if (!MO.isReg() || !MO.isDef() || MO.isEarlyClobber()) + continue; + // Be careful not to change the register if it's a tied operand. + if (MI.isRegTiedToUseOperand(I)) { + LLVM_DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); + continue; + } + // We should not have any relevant physreg defs that are replaceable by + // zero before register allocation. So we just check for dead vreg defs. + Register Reg = MO.getReg(); + if (!Reg.isVirtual() || (!MO.isDead() && !MRI->use_nodbg_empty(Reg))) + continue; + LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; + MI.print(dbgs())); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI, MF); + if (!(RC && RC->contains(RISCV::X0))) { + LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); + continue; + } + MO.setReg(RISCV::X0); + MO.setIsDead(); + LLVM_DEBUG(dbgs() << " Replacing with zero register. New:\n "; + MI.print(dbgs())); + ++NumDeadDefsReplaced; + MadeChange = true; + } + } + } + + return MadeChange; +} diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 6381263b37613..5231f3c3cf3df 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -549,7 +549,7 @@ def HasStdExtSvinval : Predicate<"Subtarget->hasStdExtSvinval()">, def FeatureStdExtZtso : SubtargetFeature<"experimental-ztso", "HasStdExtZtso", "true", "'Ztso' (Memory Model - Total Store Order)">; -def HasStdExtZtso : Predicate<"Subtarget->hasStdExtZTso()">, +def HasStdExtZtso : Predicate<"Subtarget->hasStdExtZtso()">, AssemblerPredicate<(all_of FeatureStdExtZtso), "'Ztso' (Memory Model - Total Store Order)">; def NotHasStdExtZtso : Predicate<"!Subtarget->hasStdExtZtso()">; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index add933250f847..cbcf41a979c9e 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -276,16 +276,6 @@ static Register getMaxPushPopReg(const MachineFunction &MF, return MaxPushPopReg; } -static uint64_t adjSPInPushPop(MachineBasicBlock::iterator MBBI, - unsigned RequiredStack, unsigned FreePushStack, - bool IsPop) { - if (FreePushStack > RequiredStack) - RequiredStack = 0; - unsigned Spimm = std::min(RequiredStack, 48u); - MBBI->getOperand(1).setImm(Spimm); - return alignTo(RequiredStack - Spimm, 16); -} - // Return true if the specified function should have a dedicated frame // pointer register. This is true if frame pointer elimination is // disabled, if it needs dynamic stack realignment, if the function has @@ -539,10 +529,9 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, if (RVFI->isPushable(MF) && FirstFrameSetup->getOpcode() == RISCV::CM_PUSH) { // Use available stack adjustment in push instruction to allocate additional // stack space.
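// (Worked example of the new logic: with a 16-byte-aligned StackSize of 64,
// cm.push can fold at most 48 bytes of extra adjustment, so
// Spimm = min(64, 48) = 48 and the remaining 64 - 48 = 16 bytes are
// allocated by the ordinary "addi sp, sp, -16" adjustment that follows.)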
- unsigned PushStack = RVFI->getRVPushRegs() * (STI.getXLen() / 8); - unsigned SpImmBase = RVFI->getRVPushStackSize(); - StackSize = adjSPInPushPop(FirstFrameSetup, StackSize, - (SpImmBase - PushStack), true); + uint64_t Spimm = std::min(StackSize, (uint64_t)48); + FirstFrameSetup->getOperand(1).setImm(Spimm); + StackSize -= Spimm; } if (StackSize != 0) { @@ -777,9 +766,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, MBBI->getOpcode() == RISCV::CM_POP) { // Use available stack adjustment in pop instruction to deallocate stack // space. - unsigned PushStack = RVFI->getRVPushRegs() * (STI.getXLen() / 8); - unsigned SpImmBase = RVFI->getRVPushStackSize(); - StackSize = adjSPInPushPop(MBBI, StackSize, (SpImmBase - PushStack), true); + uint64_t Spimm = std::min(StackSize, (uint64_t)48); + MBBI->getOperand(1).setImm(Spimm); + StackSize -= Spimm; } // Deallocate stack diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e74d184c0a35d..0ab3b591c7b1a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13589,6 +13589,52 @@ static bool matchIndexAsShuffle(EVT VT, SDValue Index, SDValue Mask, return ActiveLanes.all(); } +/// Match the index of a gather or scatter operation as an operation +/// with twice the element width and half the number of elements. This is +/// generally profitable (if legal) because these operations are linear +/// in VL, so even if we cause some extra VTYPE/VL toggles, we still +/// come out ahead. +static bool matchIndexAsWiderOp(EVT VT, SDValue Index, SDValue Mask, + Align BaseAlign, const RISCVSubtarget &ST) { + if (!ISD::isConstantSplatVectorAllOnes(Mask.getNode())) + return false; + if (!ISD::isBuildVectorOfConstantSDNodes(Index.getNode())) + return false; + + // Attempt a doubling. If we can use an element type 4x or 8x in + // size, this will happen via multiple iterations of the transform. + const unsigned NumElems = VT.getVectorNumElements(); + if (NumElems % 2 != 0) + return false; + + const unsigned ElementSize = VT.getScalarStoreSize(); + const unsigned WiderElementSize = ElementSize * 2; + if (WiderElementSize > ST.getELen()/8) + return false; + + if (!ST.enableUnalignedVectorMem() && BaseAlign < WiderElementSize) + return false; + + for (unsigned i = 0; i < Index->getNumOperands(); i++) { + // TODO: We've found an active bit of UB, and could be + // more aggressive here if desired. + if (Index->getOperand(i)->isUndef()) + return false; + // TODO: This offset check is too strict if we support fully + // misaligned memory operations.
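// (Example: with i32 elements (ElementSize = 4), the index vector
// [0, 4, 8, 12] qualifies: every entry is a multiple of 4 and each odd
// lane continues its even partner (4 == 0 + 4, 12 == 8 + 4), so the lanes
// pair into two i64 accesses at byte offsets 0 and 8. An index vector like
// [0, 4, 8, 16] fails at the last lane because 16 != 8 + 4.)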
+ uint64_t C = Index->getConstantOperandVal(i); + if (C % ElementSize != 0) + return false; + if (i % 2 == 0) + continue; + uint64_t Last = Index->getConstantOperandVal(i-1); + if (C != Last + ElementSize) + return false; + } + return true; +} + + SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -14020,6 +14066,36 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAG.getVectorShuffle(VT, DL, Load, DAG.getUNDEF(VT), ShuffleMask); return DAG.getMergeValues({Shuffle, Load.getValue(1)}, DL); } + + if (MGN->getExtensionType() == ISD::NON_EXTLOAD && + matchIndexAsWiderOp(VT, Index, MGN->getMask(), + MGN->getMemOperand()->getBaseAlign(), Subtarget)) { + SmallVector<SDValue> NewIndices; + for (unsigned i = 0; i < Index->getNumOperands(); i += 2) + NewIndices.push_back(Index.getOperand(i)); + EVT IndexVT = Index.getValueType() + .getHalfNumVectorElementsVT(*DAG.getContext()); + Index = DAG.getBuildVector(IndexVT, DL, NewIndices); + + unsigned ElementSize = VT.getScalarStoreSize(); + EVT WideScalarVT = MVT::getIntegerVT(ElementSize * 8 * 2); + auto EltCnt = VT.getVectorElementCount(); + assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!"); + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), WideScalarVT, + EltCnt.divideCoefficientBy(2)); + SDValue Passthru = DAG.getBitcast(WideVT, MGN->getPassThru()); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + EltCnt.divideCoefficientBy(2)); + SDValue Mask = DAG.getSplat(MaskVT, DL, DAG.getConstant(1, DL, MVT::i1)); + + SDValue Gather = + DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other), WideVT, DL, + {MGN->getChain(), Passthru, Mask, MGN->getBasePtr(), + Index, ScaleOp}, + MGN->getMemOperand(), IndexType, ISD::NON_EXTLOAD); + SDValue Result = DAG.getBitcast(VT, Gather.getValue(0)); + return DAG.getMergeValues({Result, Gather.getValue(1)}, DL); + } break; } case ISD::MSCATTER:{ @@ -14391,6 +14467,24 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, ISD::UNINDEXED, ISD::NON_EXTLOAD); return SDValue(); } + case Intrinsic::riscv_masked_strided_store: { + auto *Store = cast<MemIntrinsicSDNode>(N); + SDValue Value = N->getOperand(2); + SDValue Base = N->getOperand(3); + SDValue Stride = N->getOperand(4); + SDValue Mask = N->getOperand(5); + + // If the stride is equal to the element size in bytes, we can use + // a masked.store. + const unsigned ElementSize = Value.getValueType().getScalarStoreSize(); + if (auto *StrideC = dyn_cast<ConstantSDNode>(Stride); + StrideC && StrideC->getZExtValue() == ElementSize) + return DAG.getMaskedStore(Store->getChain(), DL, Value, Base, + DAG.getUNDEF(XLenVT), Mask, + Store->getMemoryVT(), Store->getMemOperand(), + ISD::UNINDEXED, false); + return SDValue(); + } case Intrinsic::riscv_vcpop: case Intrinsic::riscv_vcpop_mask: case Intrinsic::riscv_vfirst: diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index b42ad269c18de..5584fa8d503db 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1592,22 +1592,6 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) doLocalPostpass(MBB); - // Once we're fully done rewriting all the instructions, do a final pass - // through to check for VSETVLIs which write to an unused destination. - // For the non X0, X0 variant, we can replace the destination register - // with X0 to reduce register pressure.
This is really a generic - // optimization which can be applied to any dead def (TODO: generalize). - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (MI.getOpcode() == RISCV::PseudoVSETVLI || - MI.getOpcode() == RISCV::PseudoVSETIVLI) { - Register VRegDef = MI.getOperand(0).getReg(); - if (VRegDef != RISCV::X0 && MRI->use_nodbg_empty(VRegDef)) - MI.getOperand(0).setReg(RISCV::X0); - } - } - } - // Insert PseudoReadVL after VLEFF/VLSEGFF and replace it with the vl output // of VLEFF/VLSEGFF. for (MachineBasicBlock &MBB : MF) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index d7314866789ce..c43af14bb7f70 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -129,8 +129,9 @@ let Predicates = [HasAtomicLdSt, IsRV64] in { /// AMOs -multiclass AMOPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT> { -let Predicates = [HasStdExtA, NotHasStdExtZtso] in { +multiclass AMOPat<string AtomicOp, string BaseInst, ValueType vt = XLenVT, + list<Predicate> ExtraPreds = []> { +let Predicates = !listconcat([HasStdExtA, NotHasStdExtZtso], ExtraPreds) in { def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"), !cast<RVInst>(BaseInst), vt>; def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"), @@ -142,7 +143,7 @@ let Predicates = [HasStdExtA, NotHasStdExtZtso] in { def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_seq_cst"), !cast<RVInst>(BaseInst#"_AQ_RL"), vt>; } -let Predicates = [HasStdExtA, HasStdExtZtso] in { +let Predicates = !listconcat([HasStdExtA, HasStdExtZtso], ExtraPreds) in { def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_monotonic"), !cast<RVInst>(BaseInst), vt>; def : PatGprGpr<!cast<PatFrag>(AtomicOp#"_acquire"), @@ -156,8 +157,6 @@ let Predicates = [HasStdExtA, HasStdExtZtso] in { } } -let Predicates = [HasStdExtA] in { - defm : AMOPat<"atomic_swap_32", "AMOSWAP_W">; defm : AMOPat<"atomic_load_add_32", "AMOADD_W">; defm : AMOPat<"atomic_load_and_32", "AMOAND_W">; @@ -168,6 +167,8 @@ defm : AMOPat<"atomic_load_min_32", "AMOMIN_W">; defm : AMOPat<"atomic_load_umax_32", "AMOMAXU_W">; defm : AMOPat<"atomic_load_umin_32", "AMOMINU_W">; +let Predicates = [HasStdExtA] in { + /// Pseudo AMOs class PseudoAMO : Pseudo<(outs GPR:$res, GPR:$scratch), @@ -315,17 +316,17 @@ def : Pat<(int_riscv_masked_cmpxchg_i32 } // Predicates = [HasStdExtA] -let Predicates = [HasStdExtA, IsRV64] in { +defm : AMOPat<"atomic_swap_64", "AMOSWAP_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_add_64", "AMOADD_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_and_64", "AMOAND_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_or_64", "AMOOR_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_xor_64", "AMOXOR_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_max_64", "AMOMAX_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_min_64", "AMOMIN_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_umax_64", "AMOMAXU_D", i64, [IsRV64]>; +defm : AMOPat<"atomic_load_umin_64", "AMOMINU_D", i64, [IsRV64]>; -defm : AMOPat<"atomic_swap_64", "AMOSWAP_D", i64>; -defm : AMOPat<"atomic_load_add_64", "AMOADD_D", i64>; -defm : AMOPat<"atomic_load_and_64", "AMOAND_D", i64>; -defm : AMOPat<"atomic_load_or_64", "AMOOR_D", i64>; -defm : AMOPat<"atomic_load_xor_64", "AMOXOR_D", i64>; -defm : AMOPat<"atomic_load_max_64", "AMOMAX_D", i64>; -defm : AMOPat<"atomic_load_min_64", "AMOMIN_D", i64>; -defm : AMOPat<"atomic_load_umax_64", "AMOMAXU_D", i64>; -defm : AMOPat<"atomic_load_umin_64", "AMOMINU_D", i64>; +let Predicates = [HasStdExtA, IsRV64] in { /// 64-bit pseudo AMOs diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 01291001cd7ca..402ec20fe39ab 100644 ---
b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -201,3 +201,25 @@ def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max", FeatureStdExtM, FeatureStdExtC], [TuneNoDefaultUnroll]>; + +def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", NoSchedModel, [Feature64Bit, FeatureStdExtZifencei, FeatureStdExtZicsr, FeatureStdExtZicntr, FeatureStdExtZihpm, FeatureStdExtZihintpause, FeatureStdExtM, FeatureStdExtA, FeatureStdExtF, FeatureStdExtD, FeatureStdExtC, FeatureStdExtZba, FeatureStdExtZbb, FeatureStdExtZbc, FeatureStdExtZbs, FeatureStdExtZicbom, FeatureStdExtZicbop, FeatureStdExtZicboz, FeatureVendorXVentanaCondOps]>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 31119a403118f..30d559a4958c3 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -71,6 +71,13 @@ static cl::opt<bool> EnableRISCVCopyPropagation( cl::desc("Enable the copy propagation with RISC-V copy instr"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnableRISCVDeadRegisterElimination( + "riscv-enable-dead-defs", cl::Hidden, + cl::desc("Enable the pass that removes dead" + " definitions and replaces stores to" + " them with stores to x0"), + cl::init(true)); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target()); RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target()); @@ -79,6 +86,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVO0PreLegalizerCombinerPass(*PR); initializeRISCVPreLegalizerCombinerPass(*PR); initializeKCFIPass(*PR); + initializeRISCVDeadRegisterDefinitionsPass(*PR); initializeRISCVMakeCompressibleOptPass(*PR); initializeRISCVGatherScatterLoweringPass(*PR); initializeRISCVCodeGenPreparePass(*PR); @@ -401,6 +409,9 @@ void RISCVPassConfig::addPreRegAlloc() { if (TM->getOptLevel() != CodeGenOptLevel::None) addPass(createRISCVMergeBaseOffsetOptPass()); addPass(createRISCVInsertVSETVLIPass()); + if (TM->getOptLevel() != CodeGenOptLevel::None && + EnableRISCVDeadRegisterElimination) + addPass(createRISCVDeadRegisterDefinitionsPass()); addPass(createRISCVInsertReadWriteCSRPass()); } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp index 020cfa6ee20fa..1af7b7a5d7845 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp @@ -217,8 +217,7 @@ getExtInstSetFromString(std::string SetName) { std::string getExtInstName(SPIRV::InstructionSet::InstructionSet Set, uint32_t InstructionNumber) { const SPIRV::ExtendedBuiltin *Lookup = - SPIRV::lookupExtendedBuiltinBySetAndNumber( - SPIRV::InstructionSet::OpenCL_std, InstructionNumber); + SPIRV::lookupExtendedBuiltinBySetAndNumber(Set, InstructionNumber); if (!Lookup) return "UNKNOWN_EXT_INST"; diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h index 00553d9710b69..96cc621791e97 100644 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h @@ -59,6 +59,7 @@ struct SpecialTypeDescriptor { STK_Sampler, STK_Pipe, STK_DeviceEvent, + STK_Pointer, STK_Last = -1 }; SpecialTypeKind Kind; @@ -160,6 +161,23 @@ struct DeviceEventTypeDescriptor : public SpecialTypeDescriptor { return TD->Kind == SpecialTypeKind::STK_DeviceEvent; } }; + +struct PointerTypeDescriptor : public
SpecialTypeDescriptor { + const Type *ElementType; + unsigned AddressSpace; + + PointerTypeDescriptor() = delete; + PointerTypeDescriptor(const Type *ElementType, unsigned AddressSpace) + : SpecialTypeDescriptor(SpecialTypeKind::STK_Pointer), + ElementType(ElementType), AddressSpace(AddressSpace) { + Hash = (DenseMapInfo<const Type *>().getHashValue(ElementType) & 0xffff) ^ + ((AddressSpace << 8) | Kind); + } + + static bool classof(const SpecialTypeDescriptor *TD) { + return TD->Kind == SpecialTypeKind::STK_Pointer; + } +}; } // namespace SPIRV template <> struct DenseMapInfo<SPIRV::SpecialTypeDescriptor> { @@ -262,8 +280,14 @@ class SPIRVGeneralDuplicatesTracker { void buildDepsGraph(std::vector<SPIRV::DTSortableEntry *> &Graph, MachineModuleInfo *MMI); - void add(const Type *T, const MachineFunction *MF, Register R) { - TT.add(T, MF, R); + void add(const Type *Ty, const MachineFunction *MF, Register R) { + TT.add(Ty, MF, R); + } + + void add(const Type *PointerElementType, unsigned AddressSpace, + const MachineFunction *MF, Register R) { + ST.add(SPIRV::PointerTypeDescriptor(PointerElementType, AddressSpace), MF, + R); } void add(const Constant *C, const MachineFunction *MF, Register R) { @@ -287,8 +311,14 @@ class SPIRVGeneralDuplicatesTracker { ST.add(TD, MF, R); } - Register find(const Type *T, const MachineFunction *MF) { - return TT.find(const_cast<Type *>(T), MF); + Register find(const Type *Ty, const MachineFunction *MF) { + return TT.find(const_cast<Type *>(Ty), MF); + } + + Register find(const Type *PointerElementType, unsigned AddressSpace, + const MachineFunction *MF) { + return ST.find( + SPIRV::PointerTypeDescriptor(PointerElementType, AddressSpace), MF); } Register find(const Constant *C, const MachineFunction *MF) { diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index fbe5a542693f2..a05e108b8591a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -58,14 +58,21 @@ class SPIRVEmitIntrinsics void preprocessCompositeConstants(); void preprocessUndefs(); CallInst *buildIntrWithMD(Intrinsic::ID IntrID, ArrayRef<Type *> Types, - Value *Arg, Value *Arg2) { + Value *Arg, Value *Arg2, + ArrayRef<Constant *> Imms) { ConstantAsMetadata *CM = ValueAsMetadata::getConstant(Arg); MDTuple *TyMD = MDNode::get(F->getContext(), CM); MetadataAsValue *VMD = MetadataAsValue::get(F->getContext(), TyMD); - return IRB->CreateIntrinsic(IntrID, {Types}, {Arg2, VMD}); + SmallVector<Value *> Args; + Args.push_back(Arg2); + Args.push_back(VMD); + for (auto *Imm : Imms) + Args.push_back(Imm); + return IRB->CreateIntrinsic(IntrID, {Types}, Args); } void replaceMemInstrUses(Instruction *Old, Instruction *New); void processInstrAfterVisit(Instruction *I); + void insertAssignPtrTypeIntrs(Instruction *I); void insertAssignTypeIntrs(Instruction *I); void processGlobalValue(GlobalVariable &GV); @@ -121,6 +128,13 @@ static void setInsertPointSkippingPhis(IRBuilder<> &B, Instruction *I) { B.SetInsertPoint(I); } +static bool requireAssignPtrType(Instruction *I) { + if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I)) + return true; + + return false; +} + static bool requireAssignType(Instruction *I) { IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I); if (Intr) { @@ -387,9 +401,31 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV) { IRB->CreateIntrinsic(Intrinsic::spv_unref_global, GV.getType(), &GV); } +void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I) { + if (I->getType()->isVoidTy() || !requireAssignPtrType(I)) + return; + + setInsertPointSkippingPhis(*IRB, I->getNextNode()); + + Constant *EltTyConst; + unsigned
AddressSpace = 0; + if (auto *AI = dyn_cast<AllocaInst>(I)) { + EltTyConst = Constant::getNullValue(AI->getAllocatedType()); + AddressSpace = AI->getAddressSpace(); + } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + EltTyConst = Constant::getNullValue(GEP->getSourceElementType()); + AddressSpace = GEP->getPointerAddressSpace(); + } else { + llvm_unreachable("Unexpected instruction!"); + } + + buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {I->getType()}, EltTyConst, I, + {IRB->getInt32(AddressSpace)}); +} + void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I) { Type *Ty = I->getType(); - if (!Ty->isVoidTy() && requireAssignType(I)) { + if (!Ty->isVoidTy() && requireAssignType(I) && !requireAssignPtrType(I)) { setInsertPointSkippingPhis(*IRB, I->getNextNode()); Type *TypeToAssign = Ty; if (auto *II = dyn_cast<IntrinsicInst>(I)) { @@ -401,7 +437,7 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I) { } } Constant *Const = Constant::getNullValue(TypeToAssign); - buildIntrWithMD(Intrinsic::spv_assign_type, {Ty}, Const, I); + buildIntrWithMD(Intrinsic::spv_assign_type, {Ty}, Const, I, {}); } for (const auto &Op : I->operands()) { if (isa<ConstantPointerNull>(Op) || isa<UndefValue>(Op) || @@ -410,9 +446,10 @@ setInsertPointSkippingPhis(*IRB, I); if (isa<UndefValue>(Op) && Op->getType()->isAggregateType()) buildIntrWithMD(Intrinsic::spv_assign_type, {IRB->getInt32Ty()}, Op, - UndefValue::get(IRB->getInt32Ty())); + UndefValue::get(IRB->getInt32Ty()), {}); else - buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op); + buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op, + {}); } } } @@ -425,8 +462,8 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I) { Type *Ty = IRB->getInt32Ty(); auto t = AggrConsts.find(I); assert(t != AggrConsts.end()); - auto *NewOp = - buildIntrWithMD(Intrinsic::spv_track_constant, {Ty, Ty}, t->second, I); + auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant, {Ty, Ty}, + t->second, I, {}); I->replaceAllUsesWith(NewOp); NewOp->setArgOperand(0, I); } @@ -441,7 +478,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I) { continue; IRB->SetInsertPoint(I); auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant, - {Op->getType(), Op->getType()}, Op, Op); + {Op->getType(), Op->getType()}, Op, Op, {}); I->setOperand(OpNo, NewOp); } } @@ -484,8 +521,10 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { for (auto &I : instructions(Func)) Worklist.push_back(&I); - for (auto &I : Worklist) + for (auto &I : Worklist) { + insertAssignPtrTypeIntrs(I); insertAssignTypeIntrs(I); + } for (auto *I : Worklist) { TrackConstants = true; diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 8ac90d3ed8574..d68454f26a802 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -638,6 +638,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeFunctionWithArgs( if (Reg.isValid()) return getSPIRVTypeForVReg(Reg); SPIRVType *SpirvType = getOpTypeFunction(RetType, ArgTypes, MIRBuilder); + DT.add(Ty, CurMF, getSPIRVTypeID(SpirvType)); return finishCreatingSPIRVType(Ty, SpirvType); } @@ -748,8 +749,17 @@ SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType( // Do not add OpTypeForwardPointer to DT, a corresponding normal pointer type // will be added later. For special types it is already added to DT.
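// (Design note, paraphrasing the change: SPIR-V deduplicates OpTypePointer
// by (storage class, pointee type), but an opaque LLVM "ptr addrspace(N)"
// no longer names a pointee, so the tracker is now keyed on an explicit
// (element type, address space) pair, with i8 standing in for the unknown
// pointee of an opaque pointer. Typed and opaque IR thus funnel to the
// same canonical OpTypePointer vreg instead of minting duplicates.)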
if (SpirvType->getOpcode() != SPIRV::OpTypeForwardPointer && !Reg.isValid() && - !isSpecialOpaqueType(Ty)) - DT.add(Ty, &MIRBuilder.getMF(), getSPIRVTypeID(SpirvType)); + !isSpecialOpaqueType(Ty)) { + if (!Ty->isPointerTy()) + DT.add(Ty, &MIRBuilder.getMF(), getSPIRVTypeID(SpirvType)); + else if (Ty->isOpaquePointerTy()) + DT.add(Type::getInt8Ty(MIRBuilder.getMF().getFunction().getContext()), + Ty->getPointerAddressSpace(), &MIRBuilder.getMF(), + getSPIRVTypeID(SpirvType)); + else + DT.add(Ty->getNonOpaquePointerElementType(), Ty->getPointerAddressSpace(), + &MIRBuilder.getMF(), getSPIRVTypeID(SpirvType)); + } return SpirvType; } @@ -767,7 +777,14 @@ SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const { SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType( const Type *Ty, MachineIRBuilder &MIRBuilder, SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR) { - Register Reg = DT.find(Ty, &MIRBuilder.getMF()); + Register Reg; + if (!Ty->isPointerTy()) + Reg = DT.find(Ty, &MIRBuilder.getMF()); + else + Reg = + DT.find(Type::getInt8Ty(MIRBuilder.getMF().getFunction().getContext()), + Ty->getPointerAddressSpace(), &MIRBuilder.getMF()); + if (Reg.isValid() && !isSpecialOpaqueType(Ty)) return getSPIRVTypeForVReg(Reg); TypesInProcessing.clear(); @@ -921,8 +938,9 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeByOpcode( if (ResVReg.isValid()) return MIRBuilder.getMF().getRegInfo().getUniqueVRegDef(ResVReg); ResVReg = createTypeVReg(MIRBuilder); + SPIRVType *SpirvTy = MIRBuilder.buildInstr(Opcode).addDef(ResVReg); DT.add(Ty, &MIRBuilder.getMF(), ResVReg); - return MIRBuilder.buildInstr(Opcode).addDef(ResVReg); + return SpirvTy; } const MachineInstr * @@ -985,7 +1003,6 @@ SPIRVType *SPIRVGlobalRegistry::finishCreatingSPIRVType(const Type *LLVMTy, assert(CurMF == SpirvType->getMF()); VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType; SPIRVToLLVMType[SpirvType] = LLVMTy; - DT.add(LLVMTy, CurMF, getSPIRVTypeID(SpirvType)); return SpirvType; } @@ -1000,6 +1017,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType( .addDef(createTypeVReg(CurMF->getRegInfo())) .addImm(BitWidth) .addImm(0); + DT.add(LLVMTy, CurMF, getSPIRVTypeID(MIB)); return finishCreatingSPIRVType(LLVMTy, MIB); } @@ -1020,6 +1038,7 @@ SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineInstr &I, MachineBasicBlock &BB = *I.getParent(); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeBool)) .addDef(createTypeVReg(CurMF->getRegInfo())); + DT.add(LLVMTy, CurMF, getSPIRVTypeID(MIB)); return finishCreatingSPIRVType(LLVMTy, MIB); } @@ -1044,6 +1063,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType( .addDef(createTypeVReg(CurMF->getRegInfo())) .addUse(getSPIRVTypeID(BaseType)) .addImm(NumElements); + DT.add(LLVMTy, CurMF, getSPIRVTypeID(MIB)); return finishCreatingSPIRVType(LLVMTy, MIB); } @@ -1062,25 +1082,38 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVArrayType( .addDef(createTypeVReg(CurMF->getRegInfo())) .addUse(getSPIRVTypeID(BaseType)) .addUse(Len); + DT.add(LLVMTy, CurMF, getSPIRVTypeID(MIB)); return finishCreatingSPIRVType(LLVMTy, MIB); } SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVPointerType( SPIRVType *BaseType, MachineIRBuilder &MIRBuilder, - SPIRV::StorageClass::StorageClass SClass) { - return getOrCreateSPIRVType( - PointerType::get(const_cast(getTypeForSPIRVType(BaseType)), - storageClassToAddressSpace(SClass)), - MIRBuilder); + SPIRV::StorageClass::StorageClass SC) { + const Type *PointerElementType = getTypeForSPIRVType(BaseType); + 
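// (Usage sketch: a repeated request such as
//   getOrCreateSPIRVPointerType(I32Ty, MIRBuilder,
//                               SPIRV::StorageClass::Function)
// now hits the DT.find fast path a few lines below and returns the
// previously emitted OpTypePointer; I32Ty here is a stand-in for any
// SPIRVType already in the registry.)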
unsigned AddressSpace = storageClassToAddressSpace(SC); + Type *LLVMTy = + PointerType::get(const_cast<Type *>(PointerElementType), AddressSpace); + Register Reg = DT.find(PointerElementType, AddressSpace, CurMF); + if (Reg.isValid()) + return getSPIRVTypeForVReg(Reg); + auto MIB = BuildMI(MIRBuilder.getMBB(), MIRBuilder.getInsertPt(), + MIRBuilder.getDebugLoc(), + MIRBuilder.getTII().get(SPIRV::OpTypePointer)) + .addDef(createTypeVReg(CurMF->getRegInfo())) + .addImm(static_cast<uint32_t>(SC)) + .addUse(getSPIRVTypeID(BaseType)); + DT.add(PointerElementType, AddressSpace, CurMF, getSPIRVTypeID(MIB)); + return finishCreatingSPIRVType(LLVMTy, MIB); } SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVPointerType( SPIRVType *BaseType, MachineInstr &I, const SPIRVInstrInfo &TII, SPIRV::StorageClass::StorageClass SC) { + const Type *PointerElementType = getTypeForSPIRVType(BaseType); + unsigned AddressSpace = storageClassToAddressSpace(SC); Type *LLVMTy = - PointerType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)), - storageClassToAddressSpace(SC)); - Register Reg = DT.find(LLVMTy, CurMF); + PointerType::get(const_cast<Type *>(PointerElementType), AddressSpace); + Register Reg = DT.find(PointerElementType, AddressSpace, CurMF); if (Reg.isValid()) return getSPIRVTypeForVReg(Reg); MachineBasicBlock &BB = *I.getParent(); @@ -1088,6 +1121,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVPointerType( .addDef(createTypeVReg(CurMF->getRegInfo())) .addImm(static_cast<uint32_t>(SC)) .addUse(getSPIRVTypeID(BaseType)); + DT.add(PointerElementType, AddressSpace, CurMF, getSPIRVTypeID(MIB)); return finishCreatingSPIRVType(LLVMTy, MIB); } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index bd3b9b0228434..afa34ff3ce1fa 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -1476,9 +1476,21 @@ bool SPIRVInstructionSelector::selectGlobalValue( // FIXME: don't use MachineIRBuilder here, replace it with BuildMI. MachineIRBuilder MIRBuilder(I); const GlobalValue *GV = I.getOperand(1).getGlobal(); - SPIRVType *ResType = GR.getOrCreateSPIRVType( - GV->getType(), MIRBuilder, SPIRV::AccessQualifier::ReadWrite, false); - + Type *GVType = GV->getValueType(); + SPIRVType *PointerBaseType; + if (GVType->isArrayTy()) { + SPIRVType *ArrayElementType = + GR.getOrCreateSPIRVType(GVType->getArrayElementType(), MIRBuilder, + SPIRV::AccessQualifier::ReadWrite, false); + PointerBaseType = GR.getOrCreateSPIRVArrayType( + ArrayElementType, GVType->getArrayNumElements(), I, TII); + } else { + PointerBaseType = GR.getOrCreateSPIRVType( + GVType, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, false); + } + SPIRVType *ResType = GR.getOrCreateSPIRVPointerType( + PointerBaseType, I, TII, + addressSpaceToStorageClass(GV->getAddressSpace())); std::string GlobalIdent = GV->getGlobalIdentifier(); // We have functions as operands in tests with blocks of instruction e.g. in // transcoding/global_block.ll.
These operands are not used and should be @@ -1489,8 +1501,6 @@ bool SPIRVInstructionSelector::selectGlobalValue( MachineBasicBlock &BB = *I.getParent(); Register NewReg = GR.find(ConstVal, GR.CurMF); if (!NewReg.isValid()) { - SPIRVType *SpvBaseTy = GR.getOrCreateSPIRVIntegerType(8, I, TII); - ResType = GR.getOrCreateSPIRVPointerType(SpvBaseTy, I, TII); Register NewReg = ResVReg; GR.add(ConstVal, GR.CurMF, NewReg); return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull)) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 065eacf6ad1f1..3754f57ef3ac7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -587,6 +587,9 @@ void RequirementHandler::initAvailableCapabilitiesForOpenCL( void RequirementHandler::initAvailableCapabilitiesForVulkan( const SPIRVSubtarget &ST) { addAvailableCaps({Capability::Shader, Capability::Linkage}); + + // Provided by Vulkan version 1.0. + addAvailableCaps({Capability::Int16, Capability::Int64, Capability::Float64}); } } // namespace SPIRV diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index c0c53170f4622..f4076be2a7b77 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -242,7 +242,20 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, !ReachedBegin;) { MachineInstr &MI = *MII; - if (isSpvIntrinsic(MI, Intrinsic::spv_assign_type)) { + if (isSpvIntrinsic(MI, Intrinsic::spv_assign_ptr_type)) { + Register Reg = MI.getOperand(1).getReg(); + MIB.setInsertPt(*MI.getParent(), MI.getIterator()); + SPIRVType *BaseTy = GR->getOrCreateSPIRVType( + getMDOperandAsType(MI.getOperand(2).getMetadata(), 0), MIB); + SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( + BaseTy, MI, *MF.getSubtarget().getInstrInfo(), + addressSpaceToStorageClass(MI.getOperand(3).getImm())); + MachineInstr *Def = MRI.getVRegDef(Reg); + assert(Def && "Expecting an instruction that defines the register"); + insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB, + MF.getRegInfo()); + ToErase.push_back(&MI); + } else if (isSpvIntrinsic(MI, Intrinsic::spv_assign_type)) { Register Reg = MI.getOperand(1).getReg(); Type *Ty = getMDOperandAsType(MI.getOperand(2).getMetadata(), 0); MachineInstr *Def = MRI.getVRegDef(Reg); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index b8f93075ea625..5e84fff351b23 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -1290,9 +1290,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { // setjmpTable = (int *) malloc(40); Type *IntPtrTy = getAddrIntType(&M); Constant *size = ConstantInt::get(IntPtrTy, 40); - Instruction *SetjmpTable = - CallInst::CreateMalloc(SetjmpTableSize, IntPtrTy, IRB.getInt32Ty(), size, - nullptr, nullptr, "setjmpTable"); + IRB.SetInsertPoint(SetjmpTableSize); + auto *SetjmpTable = IRB.CreateMalloc(IntPtrTy, IRB.getInt32Ty(), size, + nullptr, nullptr, "setjmpTable"); SetjmpTable->setDebugLoc(FirstDL); // CallInst::CreateMalloc may return a bitcast instruction if the result types // mismatch. We need to set the debug loc for the original call too. 
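// For reference, a minimal standalone sketch of the builder-based malloc
// used above (hypothetical insert point; with opaque pointers the returned
// value is expected to be the malloc call itself, so no separate bitcast
// needs a debug location):
//   IRBuilder<> IRB(InsertPt);
//   Value *Size = ConstantInt::get(IntPtrTy, 40);
//   auto *Table = IRB.CreateMalloc(IntPtrTy, IRB.getInt32Ty(), Size,
//                                  /*ArraySize=*/nullptr,
//                                  /*MallocF=*/nullptr, "setjmpTable");
//   Table->setDebugLoc(FirstDL);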
@@ -1301,7 +1301,6 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { MallocCallI->setDebugLoc(FirstDL); } // setjmpTable[0] = 0; - IRB.SetInsertPoint(SetjmpTableSize); IRB.CreateStore(IRB.getInt32(0), SetjmpTable); SetjmpTableInsts.push_back(SetjmpTable); SetjmpTableSizeInsts.push_back(SetjmpTableSize); @@ -1402,14 +1401,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { if (auto *CB = dyn_cast(I)) if (auto Bundle = CB->getOperandBundle(LLVMContext::OB_funclet)) Bundles.push_back(OperandBundleDef(*Bundle)); - auto *Free = CallInst::CreateFree(SetjmpTable, Bundles, I); + IRB.SetInsertPoint(I); + auto *Free = IRB.CreateFree(SetjmpTable, Bundles); Free->setDebugLoc(DL); - // CallInst::CreateFree may create a bitcast instruction if its argument - // types mismatch. We need to set the debug loc for the bitcast too. - if (auto *FreeCallI = dyn_cast(Free)) { - if (auto *BitCastI = dyn_cast(FreeCallI->getArgOperand(0))) - BitCastI->setDebugLoc(DL); - } } // Every call to saveSetjmp can change setjmpTable and setjmpTableSize diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 2c1f5f66da74b..aa63e2b64d9ea 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -487,7 +487,7 @@ void FunctionSpecializer::promoteConstantStackValues(Function *F) { Value *GV = new GlobalVariable(M, ConstVal->getType(), true, GlobalValue::InternalLinkage, ConstVal, - "funcspec.arg"); + "specialized.arg." + Twine(++NGlobals)); if (ArgOpType != ConstVal->getType()) GV = ConstantExpr::getBitCast(cast(GV), ArgOpType); @@ -719,9 +719,10 @@ void FunctionSpecializer::removeDeadFunctions() { /// Clone the function \p F and remove the ssa_copy intrinsics added by /// the SCCPSolver in the cloned version. -static Function *cloneCandidateFunction(Function *F) { +static Function *cloneCandidateFunction(Function *F, unsigned NSpecs) { ValueToValueMapTy Mappings; Function *Clone = CloneFunction(F, Mappings); + Clone->setName(F->getName() + ".specialized." + Twine(NSpecs)); removeSSACopy(*Clone); return Clone; } @@ -879,7 +880,7 @@ bool FunctionSpecializer::isCandidateFunction(Function *F) { Function *FunctionSpecializer::createSpecialization(Function *F, const SpecSig &S) { - Function *Clone = cloneCandidateFunction(F); + Function *Clone = cloneCandidateFunction(F, Specializations.size() + 1); // The original function does not necessarily have internal linkage, but the // clone must. diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 311a421de5512..ae3ec7c044163 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -782,12 +782,52 @@ bool llvm::hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) { !DisableWholeProgramVisibility; } +static bool +typeIDVisibleToRegularObj(StringRef TypeID, + function_ref IsVisibleToRegularObj) { + // TypeID for member function pointer type is an internal construct + // and won't exist in IsVisibleToRegularObj. The full TypeID + // will be present and participate in invalidation. + if (TypeID.ends_with(".virtual")) + return false; + + // TypeID that doesn't start with Itanium mangling (_ZTS) will be + // non-externally visible types which cannot interact with + // external native files. See CodeGenModule::CreateMetadataIdentifierImpl.
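+  // Hypothetical example of the mangling assumed here: an externally
+  // visible class Foo is keyed as "_ZTS3Foo", and the query built below
+  // asks the linker about the matching type info symbol "_ZTI3Foo", which
+  // a native object may reference even when it omits the type name string.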
+ if (!TypeID.consume_front("_ZTS")) + return false; + + // TypeID is keyed off the type name symbol (_ZTS). However, the native + // object may not contain this symbol if it does not contain a key + // function for the base type and thus only contains a reference to the + // type info (_ZTI). To catch this case we query using the type info + // symbol corresponding to the TypeID. + std::string typeInfo = ("_ZTI" + TypeID).str(); + return IsVisibleToRegularObj(typeInfo); +} + +static bool +skipUpdateDueToValidation(GlobalVariable &GV, + function_ref IsVisibleToRegularObj) { + SmallVector Types; + GV.getMetadata(LLVMContext::MD_type, Types); + + for (auto Type : Types) + if (auto *TypeID = dyn_cast(Type->getOperand(1).get())) + return typeIDVisibleToRegularObj(TypeID->getString(), + IsVisibleToRegularObj); + + return false; +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definitions to linkage unit visibility in /// Module IR (for regular or hybrid LTO). void llvm::updateVCallVisibilityInModule( Module &M, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + bool ValidateAllVtablesHaveTypeInfos, + function_ref IsVisibleToRegularObj) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (GlobalVariable &GV : M.globals()) { @@ -798,7 +838,13 @@ void llvm::updateVCallVisibilityInModule( GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic && // Don't upgrade the visibility for symbols exported to the dynamic // linker, as we have no information on their eventual use. - !DynamicExportSymbols.count(GV.getGUID())) + !DynamicExportSymbols.count(GV.getGUID()) && + // With validation enabled, we want to exclude symbols visible to + // regular objects. Local symbols will be in this group due to the + // current implementation but those with VCallVisibilityTranslationUnit + // will have already been marked in clang so are unaffected. + !(ValidateAllVtablesHaveTypeInfos && + skipUpdateDueToValidation(GV, IsVisibleToRegularObj))) GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -830,12 +876,26 @@ void llvm::updatePublicTypeTestCalls(Module &M, } } +/// Based on typeID string, get all associated vtable GUIDS that are +/// visible to regular objects. +void llvm::getVisibleToRegularObjVtableGUIDs( + ModuleSummaryIndex &Index, + DenseSet &VisibleToRegularObjSymbols, + function_ref IsVisibleToRegularObj) { + for (const auto &typeID : Index.typeIdCompatibleVtableMap()) { + if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj)) + for (const TypeIdOffsetVtableInfo &P : typeID.second) + VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID()); + } +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definition summaries to linkage unit /// visibility in Module summary index (for ThinLTO). 
void llvm::updateVCallVisibilityInIndex( ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + const DenseSet &VisibleToRegularObjSymbols) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (auto &P : Index) { @@ -848,6 +908,12 @@ void llvm::updateVCallVisibilityInIndex( if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) continue; + // With validation enabled, we want to exclude symbols visible to regular + // objects. Local symbols will be in this group due to the current + // implementation but those with VCallVisibilityTranslationUnit will have + // already been marked in clang so are unaffected. + if (VisibleToRegularObjSymbols.count(P.first)) + continue; GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -1041,8 +1107,8 @@ bool DevirtModule::tryFindVirtualCallTargets( } bool DevirtIndex::tryFindVirtualCallTargets( - std::vector &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo, - uint64_t ByteOffset) { + std::vector &TargetsForSlot, + const TypeIdCompatibleVtableInfo TIdInfo, uint64_t ByteOffset) { for (const TypeIdOffsetVtableInfo &P : TIdInfo) { // Find a representative copy of the vtable initializer. // We can have multiple available_externally, linkonce_odr and weak_odr diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 291e6382f898c..05b3eaacc7b4d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1309,45 +1309,28 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, return nullptr; // InstSimplify already performed this fold if it was possible subject to - // current poison-generating flags. Try the transform again with - // poison-generating flags temporarily dropped. - bool WasNUW = false, WasNSW = false, WasExact = false, WasInBounds = false; - if (auto *OBO = dyn_cast(FalseVal)) { - WasNUW = OBO->hasNoUnsignedWrap(); - WasNSW = OBO->hasNoSignedWrap(); - FalseInst->setHasNoUnsignedWrap(false); - FalseInst->setHasNoSignedWrap(false); - } - if (auto *PEO = dyn_cast(FalseVal)) { - WasExact = PEO->isExact(); - FalseInst->setIsExact(false); - } - if (auto *GEP = dyn_cast(FalseVal)) { - WasInBounds = GEP->isInBounds(); - GEP->setIsInBounds(false); - } + // current poison-generating flags. Check whether dropping poison-generating + // flags enables the transform. // Try each equivalence substitution possibility. // We have an 'EQ' comparison, so the select's false value will propagate. // Example: // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1 + SmallVector DropFlags; if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ, - /* AllowRefinement */ false) == TrueVal || + /* AllowRefinement */ false, + &DropFlags) == TrueVal || simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ, - /* AllowRefinement */ false) == TrueVal) { + /* AllowRefinement */ false, + &DropFlags) == TrueVal) { + for (Instruction *I : DropFlags) { + I->dropPoisonGeneratingFlagsAndMetadata(); + Worklist.add(I); + } + return replaceInstUsesWith(Sel, FalseVal); } - // Restore poison-generating flags if the transform did not apply. 
- if (WasNUW) - FalseInst->setHasNoUnsignedWrap(); - if (WasNSW) - FalseInst->setHasNoSignedWrap(); - if (WasExact) - FalseInst->setIsExact(); - if (WasInBounds) - cast(FalseInst)->setIsInBounds(); - return nullptr; } diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp index a58ab093a1f75..55079b4a42d2f 100644 --- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -967,6 +967,9 @@ bool LoopPredication::isLoopProfitableToPredicate() { Numerator += Weight; Denominator += Weight; } + // If all weights are zero, act as if there was no profile data. + if (Denominator == 0) + return BranchProbability::getBranchProbability(1, NumSucc); return BranchProbability::getBranchProbability(Numerator, Denominator); } else { assert(LatchBlock != ExitingBlock && diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 7beed0a0e11aa..891f5349bd9b2 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2891,26 +2891,10 @@ class llvm::sroa::AllocaSliceRewriter if (IntTy && V->getType()->isIntegerTy()) return rewriteIntegerStore(V, SI, AATags); - const bool IsStorePastEnd = - DL.getTypeStoreSize(V->getType()).getFixedValue() > SliceSize; StoreInst *NewSI; if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && - (canConvertValue(DL, V->getType(), NewAllocaTy) || - (IsStorePastEnd && NewAllocaTy->isIntegerTy() && - V->getType()->isIntegerTy()))) { - // If this is an integer store past the end of slice (and thus the bytes - // past that point are irrelevant or this is unreachable), truncate the - // value prior to storing. - if (auto *VITy = dyn_cast(V->getType())) - if (auto *AITy = dyn_cast(NewAllocaTy)) - if (VITy->getBitWidth() > AITy->getBitWidth()) { - if (DL.isBigEndian()) - V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(), - "endian_shift"); - V = IRB.CreateTrunc(V, AITy, "load.trunc"); - } - + canConvertValue(DL, V->getType(), NewAllocaTy)) { V = convertValue(DL, IRB, V, NewAllocaTy); Value *NewPtr = getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile()); diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index c8c0823daaa6b..f020637ee186d 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -850,7 +850,7 @@ class ExtTSPImpl { // Attach (a part of) ChainPred after the last node of ChainSucc.
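// Note that every jump in OutJumps originates at this node, so Jump->Source
// always names the node itself; the destination chain has to be looked up
// through Jump->Target, as the corrected line below does. A sketch of the
// invariant assumed here:
//   for (JumpT *J : Node->OutJumps)
//     assert(J->Source == Node && "out-jump must start at its node");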
for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) { - const NodeT *DstBlock = Jump->Source; + const NodeT *DstBlock = Jump->Target; if (DstBlock->CurChain != ChainPred) continue; size_t Offset = DstBlock->CurIndex; diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll index 3f5c8e4c1c72f..30ffc4cb5e1e8 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll @@ -936,3 +936,126 @@ for.body: exit: ret void } + +define void @forked_ptrs_with_different_base(ptr nocapture readonly %Preds, ptr nocapture %a, ptr nocapture %b, ptr nocapture readonly %c) { +; CHECK: for.body: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[G1:.+]]): +; CHECK-NEXT: %arrayidx7 = getelementptr inbounds double, ptr %.sink, i64 %indvars.iv +; CHECK-NEXT: Against group ([[G2:.+]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %Preds, i64 %indvars.iv +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group ([[G1]]): +; CHECK-NEXT: %arrayidx7 = getelementptr inbounds double, ptr %.sink, i64 %indvars.iv +; CHECK-NEXT: Against group ([[G4:.+]]): +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds double, ptr %0, i64 %indvars.iv +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[G3:.+]]): +; CHECK-NEXT: %arrayidx7 = getelementptr inbounds double, ptr %.sink, i64 %indvars.iv +; CHECK-NEXT: Against group ([[G2]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %Preds, i64 %indvars.iv +; CHECK-NEXT: Check 3: +; CHECK-NEXT: Comparing group ([[G3]]): +; CHECK-NEXT: %arrayidx7 = getelementptr inbounds double, ptr %.sink, i64 %indvars.iv +; CHECK-NEXT: Against group ([[G4]]): +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds double, ptr %0, i64 %indvars.iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[G1]]: +; CHECK-NEXT: (Low: %1 High: (63992 + %1)) +; CHECK-NEXT: Member: {%1,+,8}<%for.body> +; CHECK-NEXT: Group [[G3]]: +; CHECK-NEXT: (Low: %2 High: (63992 + %2)) +; CHECK-NEXT: Member: {%2,+,8}<%for.body> +; CHECK-NEXT: Group [[G2]]: +; CHECK-NEXT: (Low: %Preds High: (31996 + %Preds)) +; CHECK-NEXT: Member: {%Preds,+,4}<%for.body> +; CHECK-NEXT: Group [[G4]]: +; CHECK-NEXT: (Low: %0 High: (63992 + %0)) +; CHECK-NEXT: Member: {%0,+,8}<%for.body> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
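+; The stored-to base %.sink is a PHI of the pointers loaded from %a and %b,
+; so the analysis forks the store group and checks each candidate base both
+; against the %Preds accesses and against the loads off the pointer from %c.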
+entry: + %0 = load ptr, ptr %c, align 64 + %1 = load ptr, ptr %a, align 64 + %2 = load ptr, ptr %b, align 64 + br label %for.body + +for.cond.cleanup: ; preds = %for.inc + ret void + +for.body: ; preds = %entry, %for.inc + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %Preds, i64 %indvars.iv + %3 = load i32, ptr %arrayidx, align 4 + %cmp2.not = icmp eq i32 %3, 0 + br i1 %cmp2.not, label %if.else, label %if.then + +if.then: ; preds = %for.body + %arrayidx5 = getelementptr inbounds double, ptr %0, i64 %indvars.iv + %4 = load double, ptr %arrayidx5, align 8 + %add = fadd fast double %4, 1.000000e+00 + br label %for.inc + +if.else: ; preds = %for.body + %5 = mul nuw nsw i64 %indvars.iv, %indvars.iv + %6 = trunc i64 %5 to i32 + %conv8 = sitofp i32 %6 to double + br label %for.inc + +for.inc: ; preds = %if.then, %if.else + %.sink = phi ptr [ %1, %if.then ], [ %2, %if.else ] + %add.sink = phi double [ %add, %if.then ], [ %conv8, %if.else ] + %arrayidx7 = getelementptr inbounds double, ptr %.sink, i64 %indvars.iv + store double %add.sink, ptr %arrayidx7, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 7999 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +; Negative test: the number of incoming values of the PhiNode is not 2. +define void @forked_ptrs_with_different_base3(ptr nocapture readonly %Preds, ptr nocapture %a, ptr nocapture %b, ptr nocapture readonly %c) { +; CHECK: for.body: +; CHECK-NEXT: Report: cannot identify array bounds +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
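+; With three incoming values, the forked-pointer handling (which only covers
+; two-way forks) gives up, so no runtime checks are emitted.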
+entry: + %ld.c = load ptr, ptr %c, align 64 + %ld.a = load ptr, ptr %a, align 64 + %ld.b = load ptr, ptr %b, align 64 + br label %for.body + +for.body: ; preds = %entry, %for.inc + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %Preds, i64 %indvars.iv + %ld.preds = load i32, ptr %arrayidx, align 4 + switch i32 %ld.preds, label %if.else [ + i32 0, label %if.br0 + i32 1, label %if.br1 + ] + +if.br0: ; preds = %for.body + br label %for.inc + +if.br1: ; preds = %for.body + br label %for.inc + +if.else: ; preds = %for.body + br label %for.inc + +for.inc: ; preds = %if.br1, %if.br0 + %.sink = phi ptr [ %ld.a, %if.br0 ], [ %ld.b, %if.br1 ], [ %ld.c, %if.else ] + %arrayidx7 = getelementptr inbounds double, ptr %.sink, i64 %indvars.iv + store double 1.000000e+00, ptr %arrayidx7, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 7999 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.inc + ret void +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll index bf420700eb575..55750ab34e17a 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll @@ -14,8 +14,8 @@ define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) { ; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: uaddlv.8h s0, v0 ; CHECK-NEXT: mov.s v1[0], v0[0] -; CHECK-NEXT: ucvtf.2s v1, v1 -; CHECK-NEXT: str d1, [x0] +; CHECK-NEXT: ucvtf.2s v0, v1 +; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret entry: @@ -52,8 +52,8 @@ define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) { ; CHECK-NEXT: uaddlv.8h s1, v0 ; CHECK-NEXT: stp q0, q0, [x0, #32] ; CHECK-NEXT: mov.s v2[0], v1[0] -; CHECK-NEXT: ucvtf.4s v2, v2 -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ucvtf.4s v1, v2 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret entry: @@ -76,8 +76,8 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) { ; CHECK-NEXT: st1.s { v0 }[2], [x8] ; CHECK-NEXT: str d0, [x0, #80] ; CHECK-NEXT: mov.s v2[0], v1[0] -; CHECK-NEXT: ucvtf.4s v2, v2 -; CHECK-NEXT: str q2, [x0] +; CHECK-NEXT: ucvtf.4s v1, v2 +; CHECK-NEXT: str q1, [x0] ; CHECK-NEXT: ret entry: @@ -256,9 +256,9 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) { ; CHECK-NEXT: uaddlv.4h s1, v0 ; CHECK-NEXT: stp q0, q0, [x0, #32] ; CHECK-NEXT: mov.s v2[0], v1[0] -; CHECK-NEXT: ucvtf.2d v2, v2 -; CHECK-NEXT: fcvtn v2.2s, v2.2d -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: ucvtf.2d v1, v2 +; CHECK-NEXT: fcvtn v1.2s, v1.2d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll index e8437b5cd801f..a65c5d6667794 100644 --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -221,69 +221,69 @@ define @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) { ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z0.d, x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z7.d, x1 +; CHECK-NEXT: mov z3.d, x1 ; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: uqadd z5.d, z1.d, z0.d +; CHECK-NEXT: uqadd z25.d, z1.d, z0.d ; CHECK-NEXT: incd z1.d, all, mul #8 ; CHECK-NEXT: incd z2.d -; CHECK-NEXT: incd z3.d, 
all, mul #2 +; CHECK-NEXT: incd z4.d, all, mul #2 ; CHECK-NEXT: incd z6.d, all, mul #4 -; CHECK-NEXT: cmphi p1.d, p0/z, z7.d, z5.d +; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z25.d ; CHECK-NEXT: uqadd z1.d, z1.d, z0.d -; CHECK-NEXT: mov z4.d, z2.d -; CHECK-NEXT: uqadd z24.d, z2.d, z0.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z27.d, z3.d -; CHECK-NEXT: uqadd z26.d, z3.d, z0.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: uqadd z26.d, z2.d, z0.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: mov z24.d, z4.d +; CHECK-NEXT: uqadd z27.d, z4.d, z0.d ; CHECK-NEXT: uqadd z28.d, z6.d, z0.d ; CHECK-NEXT: incd z2.d, all, mul #8 -; CHECK-NEXT: incd z3.d, all, mul #8 -; CHECK-NEXT: incd z6.d, all, mul #8 -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: incd z25.d, all, mul #4 -; CHECK-NEXT: cmphi p2.d, p0/z, z7.d, z24.d -; CHECK-NEXT: incd z27.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z7.d, z26.d -; CHECK-NEXT: cmphi p5.d, p0/z, z7.d, z28.d -; CHECK-NEXT: uqadd z2.d, z2.d, z0.d -; CHECK-NEXT: uqadd z3.d, z3.d, z0.d -; CHECK-NEXT: mov z24.d, z4.d -; CHECK-NEXT: uqadd z5.d, z4.d, z0.d -; CHECK-NEXT: uqadd z26.d, z25.d, z0.d ; CHECK-NEXT: incd z4.d, all, mul #8 -; CHECK-NEXT: incd z25.d, all, mul #8 -; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: incd z6.d, all, mul #8 +; CHECK-NEXT: incd z5.d, all, mul #2 +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z26.d ; CHECK-NEXT: incd z24.d, all, mul #4 -; CHECK-NEXT: cmphi p8.d, p0/z, z7.d, z2.d -; CHECK-NEXT: cmphi p4.d, p0/z, z7.d, z5.d -; CHECK-NEXT: uqadd z5.d, z27.d, z0.d -; CHECK-NEXT: incd z27.d, all, mul #8 +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z27.d +; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z28.d +; CHECK-NEXT: uqadd z2.d, z2.d, z0.d ; CHECK-NEXT: uqadd z4.d, z4.d, z0.d -; CHECK-NEXT: cmphi p6.d, p0/z, z7.d, z26.d -; CHECK-NEXT: uqadd z28.d, z24.d, z0.d +; CHECK-NEXT: uqadd z6.d, z6.d, z0.d +; CHECK-NEXT: mov z26.d, z5.d +; CHECK-NEXT: uqadd z25.d, z5.d, z0.d +; CHECK-NEXT: uqadd z27.d, z7.d, z0.d +; CHECK-NEXT: incd z5.d, all, mul #8 +; CHECK-NEXT: incd z7.d, all, mul #8 +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: incd z26.d, all, mul #4 +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z2.d +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z25.d +; CHECK-NEXT: uqadd z25.d, z24.d, z0.d ; CHECK-NEXT: incd z24.d, all, mul #8 +; CHECK-NEXT: uqadd z5.d, z5.d, z0.d +; CHECK-NEXT: uqadd z7.d, z7.d, z0.d +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z27.d +; CHECK-NEXT: uqadd z28.d, z26.d, z0.d +; CHECK-NEXT: incd z26.d, all, mul #8 ; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s -; CHECK-NEXT: cmphi p7.d, p0/z, z7.d, z5.d -; CHECK-NEXT: uqadd z5.d, z6.d, z0.d -; CHECK-NEXT: uqadd z6.d, z25.d, z0.d -; CHECK-NEXT: uqadd z25.d, z27.d, z0.d -; CHECK-NEXT: cmphi p4.d, p0/z, z7.d, z1.d +; CHECK-NEXT: uqadd z24.d, z24.d, z0.d +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z25.d +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z1.d ; CHECK-NEXT: uzp1 p5.s, p5.s, p6.s -; CHECK-NEXT: cmphi p6.d, p0/z, z7.d, z3.d -; CHECK-NEXT: cmphi p9.d, p0/z, z7.d, z4.d -; CHECK-NEXT: uqadd z0.d, z24.d, z0.d -; CHECK-NEXT: cmphi p2.d, p0/z, z7.d, z28.d -; CHECK-NEXT: cmphi p10.d, p0/z, z7.d, z6.d +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z4.d +; CHECK-NEXT: cmphi p9.d, p0/z, z3.d, z5.d +; CHECK-NEXT: cmphi p10.d, p0/z, z3.d, z7.d +; CHECK-NEXT: uqadd z0.d, z26.d, z0.d +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z28.d ; CHECK-NEXT: uzp1 p4.s, p4.s, p8.s -; CHECK-NEXT: cmphi p8.d, p0/z, z7.d, z25.d +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z24.d ; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s ; CHECK-NEXT: ldr p9, 
[sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p2.s, p7.s, p2.s -; CHECK-NEXT: cmphi p7.d, p0/z, z7.d, z5.d -; CHECK-NEXT: cmphi p0.d, p0/z, z7.d, z0.d ; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: uzp1 p2.s, p7.s, p2.s +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z6.d +; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z0.d ; CHECK-NEXT: uzp1 p7.s, p7.s, p10.s ; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: uzp1 p0.s, p8.s, p0.s diff --git a/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll index 091fb7f0c730a..65cc368c0b561 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll @@ -10,28 +10,28 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define zeroext i8 @fullGtU(i32 %i1, i32 %i2) { ; CHECK-LABEL: fullGtU: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: adrp x8, _block@GOTPAGE +; CHECK-NEXT: adrp x9, _block@GOTPAGE ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: ; kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: sxtw x8, w0 ; CHECK-NEXT: sxtw x10, w1 -; CHECK-NEXT: ldr x8, [x8, _block@GOTPAGEOFF] -; CHECK-NEXT: ldr x8, [x8] -; CHECK-NEXT: ldrb w11, [x8, x9] -; CHECK-NEXT: ldrb w12, [x8, x10] +; CHECK-NEXT: ldr x9, [x9, _block@GOTPAGEOFF] +; CHECK-NEXT: ldr x9, [x9] +; CHECK-NEXT: ldrb w11, [x9, x8] +; CHECK-NEXT: ldrb w12, [x9, x10] ; CHECK-NEXT: cmp w11, w12 ; CHECK-NEXT: b.ne LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %if.end -; CHECK-NEXT: add x9, x9, x8 -; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: ldrb w10, [x9, #1] -; CHECK-NEXT: ldrb w11, [x8, #1] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: add x9, x10, x9 +; CHECK-NEXT: ldrb w10, [x8, #1] +; CHECK-NEXT: ldrb w11, [x9, #1] ; CHECK-NEXT: cmp w10, w11 ; CHECK-NEXT: b.ne LBB0_3 ; CHECK-NEXT: ; %bb.2: ; %if.end25 -; CHECK-NEXT: ldrb w9, [x9, #2] ; CHECK-NEXT: ldrb w8, [x8, #2] -; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: ldrb w9, [x9, #2] +; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w8, hi ; CHECK-NEXT: csel w0, wzr, w8, eq ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-cse.ll b/llvm/test/CodeGen/AArch64/arm64-cse.ll index 7afa30970dff2..3cacc25e02b21 100644 --- a/llvm/test/CodeGen/AArch64/arm64-cse.ll +++ b/llvm/test/CodeGen/AArch64/arm64-cse.ll @@ -8,16 +8,16 @@ target triple = "arm64-apple-ios" define ptr @t1(ptr %base, ptr nocapture %offset, i32 %size) nounwind { ; CHECK-LABEL: t1: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ldr w9, [x1] -; CHECK-NEXT: subs w8, w9, w2 +; CHECK-NEXT: ldr w8, [x1] +; CHECK-NEXT: subs w9, w8, w2 ; CHECK-NEXT: b.ge LBB0_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: ret ; CHECK-NEXT: LBB0_2: ; %if.end -; CHECK-NEXT: add x0, x0, w8, sxtw -; CHECK-NEXT: sub w9, w9, w8 -; CHECK-NEXT: str w9, [x1] +; CHECK-NEXT: add x0, x0, w9, sxtw +; CHECK-NEXT: sub w8, w8, w9 +; CHECK-NEXT: str w8, [x1] ; CHECK-NEXT: ret entry: %0 = load i32, ptr %offset, align 4 diff --git a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll index 80b8c963a697c..5806bcf0dacf1 100644 --- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -1059,15 +1059,15 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; ENABLE-NEXT: .cfi_offset w27, -88 ; ENABLE-NEXT: .cfi_offset w28, -96 ; ENABLE-NEXT: lsl w8, w1, w0 -; ENABLE-NEXT: lsr w10, w0, 
w1 -; ENABLE-NEXT: lsl w16, w0, w1 +; ENABLE-NEXT: lsr w9, w0, w1 +; ENABLE-NEXT: lsl w14, w0, w1 ; ENABLE-NEXT: lsr w11, w1, w0 -; ENABLE-NEXT: add w14, w1, w0 -; ENABLE-NEXT: sub w9, w8, w10 +; ENABLE-NEXT: add w15, w1, w0 +; ENABLE-NEXT: sub w10, w8, w9 ; ENABLE-NEXT: subs w17, w1, w0 -; ENABLE-NEXT: add w15, w16, w8 -; ENABLE-NEXT: add w12, w10, w11 -; ENABLE-NEXT: add w13, w11, w14 +; ENABLE-NEXT: add w16, w14, w8 +; ENABLE-NEXT: add w12, w9, w11 +; ENABLE-NEXT: add w13, w11, w15 ; ENABLE-NEXT: b.le LBB14_2 ; ENABLE-NEXT: ; %bb.1: ; %true ; ENABLE-NEXT: str w0, [sp] @@ -1075,14 +1075,14 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; ENABLE-NEXT: nop ; ENABLE-NEXT: ; InlineAsm End ; ENABLE-NEXT: LBB14_2: ; %false -; ENABLE-NEXT: str w16, [x2] +; ENABLE-NEXT: str w14, [x2] ; ENABLE-NEXT: str w8, [x3] -; ENABLE-NEXT: str w10, [x4] +; ENABLE-NEXT: str w9, [x4] ; ENABLE-NEXT: str w11, [x5] -; ENABLE-NEXT: str w14, [x6] +; ENABLE-NEXT: str w15, [x6] ; ENABLE-NEXT: str w17, [x7] ; ENABLE-NEXT: stp w0, w1, [x2, #4] -; ENABLE-NEXT: stp w15, w9, [x2, #12] +; ENABLE-NEXT: stp w16, w10, [x2, #12] ; ENABLE-NEXT: stp w12, w13, [x2, #20] ; ENABLE-NEXT: sub sp, x29, #80 ; ENABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload @@ -1118,15 +1118,15 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; DISABLE-NEXT: .cfi_offset w27, -88 ; DISABLE-NEXT: .cfi_offset w28, -96 ; DISABLE-NEXT: lsl w8, w1, w0 -; DISABLE-NEXT: lsr w10, w0, w1 -; DISABLE-NEXT: lsl w16, w0, w1 +; DISABLE-NEXT: lsr w9, w0, w1 +; DISABLE-NEXT: lsl w14, w0, w1 ; DISABLE-NEXT: lsr w11, w1, w0 -; DISABLE-NEXT: add w14, w1, w0 -; DISABLE-NEXT: sub w9, w8, w10 +; DISABLE-NEXT: add w15, w1, w0 +; DISABLE-NEXT: sub w10, w8, w9 ; DISABLE-NEXT: subs w17, w1, w0 -; DISABLE-NEXT: add w15, w16, w8 -; DISABLE-NEXT: add w12, w10, w11 -; DISABLE-NEXT: add w13, w11, w14 +; DISABLE-NEXT: add w16, w14, w8 +; DISABLE-NEXT: add w12, w9, w11 +; DISABLE-NEXT: add w13, w11, w15 ; DISABLE-NEXT: b.le LBB14_2 ; DISABLE-NEXT: ; %bb.1: ; %true ; DISABLE-NEXT: str w0, [sp] @@ -1134,14 +1134,14 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr ; DISABLE-NEXT: nop ; DISABLE-NEXT: ; InlineAsm End ; DISABLE-NEXT: LBB14_2: ; %false -; DISABLE-NEXT: str w16, [x2] +; DISABLE-NEXT: str w14, [x2] ; DISABLE-NEXT: str w8, [x3] -; DISABLE-NEXT: str w10, [x4] +; DISABLE-NEXT: str w9, [x4] ; DISABLE-NEXT: str w11, [x5] -; DISABLE-NEXT: str w14, [x6] +; DISABLE-NEXT: str w15, [x6] ; DISABLE-NEXT: str w17, [x7] ; DISABLE-NEXT: stp w0, w1, [x2, #4] -; DISABLE-NEXT: stp w15, w9, [x2, #12] +; DISABLE-NEXT: stp w16, w10, [x2, #12] ; DISABLE-NEXT: stp w12, w13, [x2, #20] ; DISABLE-NEXT: sub sp, x29, #80 ; DISABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll index 3b064b718cd67..20adcdf2956d6 100644 --- a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll @@ -9,16 +9,15 @@ define i32 @widget(i64 %arg, <8 x i16> %arg1) { ; CHECK: // %bb.0: // %bb ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: bfi x10, x0, #1, #3 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: 
dup v0.8h, v0.h[0] +; CHECK-NEXT: bfi x9, x0, #1, #3 ; CHECK-NEXT: mov x8, x0 ; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: dup v1.8h, w9 -; CHECK-NEXT: str q0, [sp] -; CHECK-NEXT: ld1 { v1.h }[1], [x10] -; CHECK-NEXT: str q1, [x8] +; CHECK-NEXT: str q1, [sp] +; CHECK-NEXT: ld1 { v0.h }[1], [x9] +; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll index f00265a80e032..01fd2b1113b00 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll @@ -26,25 +26,25 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: zip2 p2.d, p1.d, p1.d +; CHECK-NEXT: zip2 p3.d, p1.d, p1.d ; CHECK-NEXT: add x13, x0, x8 ; CHECK-NEXT: add x14, x1, x8 -; CHECK-NEXT: zip1 p3.d, p1.d, p1.d +; CHECK-NEXT: zip1 p2.d, p1.d, p1.d ; CHECK-NEXT: mov z6.d, z1.d ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: whilelo p1.d, x12, x9 ; CHECK-NEXT: add x8, x8, x11 ; CHECK-NEXT: add x12, x12, x10 -; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p3/z, [x13] -; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p3/z, [x14] +; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] +; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p3/m, z6.d +; CHECK-NEXT: mov z0.d, p3/m, z7.d +; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.mi .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -237,19 +237,19 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: add x9, x9, x11 ; CHECK-NEXT: add x8, x8, x12 ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 -; CHECK-NEXT: zip2 p2.d, p1.d, p1.d -; CHECK-NEXT: zip1 p3.d, p1.d, p1.d +; CHECK-NEXT: zip2 p3.d, p1.d, p1.d +; CHECK-NEXT: zip1 p2.d, p1.d, p1.d ; CHECK-NEXT: whilelo p1.d, x9, x10 -; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p3/z, [x13] -; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p3/z, [x14] +; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] +; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p3/m, z6.d +; CHECK-NEXT: mov z0.d, p3/m, z7.d +; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.mi .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll index 40fd7a392c83b..44d0a9392ba62 
100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll @@ -144,10 +144,10 @@ define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64_unrolled: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -159,24 +159,24 @@ define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q7, q6, [x10] ; CHECK-NEXT: ldp q17, q16, [x9, #32] ; CHECK-NEXT: ldp q19, q18, [x10, #32] -; CHECK-NEXT: fcmla v1.2d, v7.2d, v5.2d, #0 +; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0 ; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0 -; CHECK-NEXT: fcmla v2.2d, v19.2d, v17.2d, #0 +; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #0 -; CHECK-NEXT: fcmla v1.2d, v7.2d, v5.2d, #90 +; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #90 ; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #90 -; CHECK-NEXT: fcmla v2.2d, v19.2d, v17.2d, #90 +; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #90 ; CHECK-NEXT: fcmla v3.2d, v18.2d, v16.2d, #90 ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: zip2 v4.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v3.2d, v1.2d, v0.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d +; CHECK-NEXT: zip2 v4.2d, v1.2d, v3.2d +; CHECK-NEXT: zip1 v1.2d, v1.2d, v3.2d +; CHECK-NEXT: zip2 v3.2d, v2.2d, v0.2d +; CHECK-NEXT: zip1 v0.2d, v2.2d, v0.2d +; CHECK-NEXT: fadd v0.2d, v1.2d, v0.2d ; CHECK-NEXT: fadd v1.2d, v4.2d, v3.2d -; CHECK-NEXT: fadd v0.2d, v2.2d, v0.2d -; CHECK-NEXT: faddp d1, v1.2d ; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: faddp d1, v1.2d ; CHECK-NEXT: ret entry: %scevgep = getelementptr i8, ptr %a, i64 32 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 2cbc8ed3192de..e8d9ec7dc85de 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -217,7 +217,7 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK-NEXT: add x10, sp, #72 ; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ldr s17, [sp, #96] +; CHECK-NEXT: ldr s17, [sp, #104] ; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 @@ -225,21 +225,21 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK-NEXT: ldr s20, [sp, #192] ; CHECK-NEXT: mov v1.s[2], v5.s[0] ; CHECK-NEXT: ld1 { v16.s }[2], [x10] -; CHECK-NEXT: ldr s5, [sp, #104] +; CHECK-NEXT: ldr s5, [sp, #96] ; CHECK-NEXT: ld1 { v3.s }[2], [x9] ; CHECK-NEXT: add x9, sp, #24 ; CHECK-NEXT: add x10, sp, #112 ; CHECK-NEXT: ld1 { v18.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #88 ; CHECK-NEXT: mov v0.s[2], v4.s[0] -; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: ld1 { v5.s }[1], [x10] ; CHECK-NEXT: add x10, sp, #80 ; CHECK-NEXT: ld1 { v16.s }[3], [x9] ; CHECK-NEXT: mov 
v1.s[3], v7.s[0] ; CHECK-NEXT: add x9, sp, #120 ; CHECK-NEXT: ldr s4, [sp, #128] ; CHECK-NEXT: ld1 { v3.s }[3], [x10] -; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: ld1 { v17.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: ldr s7, [sp] ; CHECK-NEXT: ld1 { v4.s }[1], [x9] @@ -247,8 +247,8 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK-NEXT: add x10, sp, #16 ; CHECK-NEXT: add x9, sp, #160 ; CHECK-NEXT: fmul v6.4s, v16.4s, v1.4s -; CHECK-NEXT: fmul v19.4s, v5.4s, v18.4s -; CHECK-NEXT: fmul v18.4s, v17.4s, v18.4s +; CHECK-NEXT: fmul v19.4s, v17.4s, v18.4s +; CHECK-NEXT: fmul v18.4s, v5.4s, v18.4s ; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: ld1 { v4.s }[2], [x9] @@ -259,21 +259,21 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK-NEXT: ld1 { v20.s }[1], [x10] ; CHECK-NEXT: fneg v6.4s, v6.4s ; CHECK-NEXT: fneg v19.4s, v19.4s -; CHECK-NEXT: fmla v18.4s, v7.4s, v5.4s +; CHECK-NEXT: fmla v18.4s, v7.4s, v17.4s ; CHECK-NEXT: fmla v1.4s, v0.4s, v16.4s ; CHECK-NEXT: ld1 { v4.s }[3], [x9] ; CHECK-NEXT: add x9, sp, #168 ; CHECK-NEXT: ld1 { v2.s }[2], [x9] -; CHECK-NEXT: ldr s5, [sp, #200] +; CHECK-NEXT: ldr s16, [sp, #200] ; CHECK-NEXT: add x9, sp, #216 ; CHECK-NEXT: add x10, sp, #184 ; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v19.4s, v7.4s, v17.4s -; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: fmla v19.4s, v7.4s, v5.4s +; CHECK-NEXT: ld1 { v16.s }[1], [x9] ; CHECK-NEXT: fsub v0.4s, v4.4s, v1.4s ; CHECK-NEXT: fsub v1.4s, v20.4s, v18.4s ; CHECK-NEXT: ld1 { v2.s }[3], [x10] -; CHECK-NEXT: fadd v3.4s, v5.4s, v19.4s +; CHECK-NEXT: fadd v3.4s, v16.4s, v19.4s ; CHECK-NEXT: fadd v2.4s, v2.4s, v6.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 ; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index a93762918cd87..99f573795489a 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -645,27 +645,27 @@ define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_load: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: add x11, x1, #12 -; CHECK-NEXT: str s0, [x4] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ldp s1, s5, [x2] +; CHECK-NEXT: str s1, [x4] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: umov w9, v1.h[0] -; CHECK-NEXT: umov w10, v1.h[1] -; CHECK-NEXT: mov v2.b[8], w9 -; CHECK-NEXT: umov w9, v1.h[2] -; CHECK-NEXT: mov v2.b[9], w10 -; CHECK-NEXT: umov w10, v1.h[3] -; CHECK-NEXT: ldr s1, [x1] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: mov v2.b[10], w9 +; CHECK-NEXT: ldp s0, s5, [x2] +; CHECK-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-NEXT: umov w9, v2.h[0] +; CHECK-NEXT: umov w10, v2.h[1] +; CHECK-NEXT: mov v0.b[8], w9 +; CHECK-NEXT: umov w9, v2.h[2] +; CHECK-NEXT: mov v0.b[9], w10 +; CHECK-NEXT: umov w10, v2.h[3] +; CHECK-NEXT: ldr s2, [x1] +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: mov v0.b[10], w9 ; CHECK-NEXT: add x9, x1, #4 -; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v2.b[11], w10 +; CHECK-NEXT: uzp1 v1.8b, v1.8b, v2.8b +; CHECK-NEXT: mov v0.b[11], w10 ; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ld1 { v2.s }[3], [x3], #4 +; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 ; 
CHECK-NEXT: ldr s4, [x0, #12] ; CHECK-NEXT: ldp s3, s16, [x0, #4] ; CHECK-NEXT: ldp s6, s7, [x2, #8] @@ -676,19 +676,19 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: add x8, x1, #8 ; CHECK-NEXT: ld1 { v16.s }[1], [x8] -; CHECK-NEXT: uaddl v1.8h, v3.8b, v4.8b +; CHECK-NEXT: uaddl v2.8h, v3.8b, v4.8b ; CHECK-NEXT: ushll v3.8h, v6.8b, #0 ; CHECK-NEXT: uaddl v4.8h, v5.8b, v7.8b -; CHECK-NEXT: uaddl v5.8h, v0.8b, v16.8b -; CHECK-NEXT: uaddw2 v2.8h, v3.8h, v2.16b -; CHECK-NEXT: ushll v0.4s, v1.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 +; CHECK-NEXT: uaddl v1.8h, v1.8b, v16.8b +; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b +; CHECK-NEXT: ushll v0.4s, v2.4h, #3 +; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 ; CHECK-NEXT: ushll v6.4s, v4.4h, #3 ; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3 -; CHECK-NEXT: uaddw v0.4s, v0.4s, v5.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v5.8h -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h -; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v5.8h +; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p store <4 x i8> %lp1, ptr %z @@ -757,39 +757,39 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_shuffle: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s2, s3, [x0, #8] +; CHECK-NEXT: ldp s2, s7, [x0, #8] ; CHECK-NEXT: add x8, x3, #8 -; CHECK-NEXT: ldr s16, [x1, #12] +; CHECK-NEXT: ldr s18, [x1, #12] ; CHECK-NEXT: ldp s0, s1, [x2] -; CHECK-NEXT: ldp s6, s7, [x0] +; CHECK-NEXT: ldp s3, s16, [x0] ; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: mov v4.16b, v3.16b -; CHECK-NEXT: ldp s17, s18, [x2, #8] +; CHECK-NEXT: mov v4.16b, v7.16b +; CHECK-NEXT: ldp s6, s17, [x2, #8] ; CHECK-NEXT: ldr s5, [x3, #12] -; CHECK-NEXT: mov v3.s[1], v16.s[0] +; CHECK-NEXT: mov v7.s[1], v18.s[0] ; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: mov v4.s[1], v16.s[0] -; CHECK-NEXT: ld1 { v6.s }[1], [x1], #4 +; CHECK-NEXT: mov v4.s[1], v18.s[0] +; CHECK-NEXT: ld1 { v3.s }[1], [x1], #4 ; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] ; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v7.s }[1], [x1] -; CHECK-NEXT: mov v4.s[2], v18.s[0] -; CHECK-NEXT: mov v18.s[1], v5.s[0] -; CHECK-NEXT: uaddl v2.8h, v6.8b, v2.8b -; CHECK-NEXT: uaddl v6.8h, v0.8b, v17.8b -; CHECK-NEXT: uaddl v3.8h, v7.8b, v3.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v18.8b +; CHECK-NEXT: ld1 { v16.s }[1], [x1] +; CHECK-NEXT: mov v4.s[2], v17.s[0] +; CHECK-NEXT: mov v17.s[1], v5.s[0] +; CHECK-NEXT: uaddl v2.8h, v3.8b, v2.8b +; CHECK-NEXT: uaddl v6.8h, v0.8b, v6.8b +; CHECK-NEXT: uaddl v7.8h, v16.8b, v7.8b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v17.8b ; CHECK-NEXT: mov v4.s[3], v5.s[0] -; CHECK-NEXT: ushll v0.4s, v3.4h, #3 -; CHECK-NEXT: ushll v7.4s, v1.4h, #3 -; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 -; CHECK-NEXT: ushll2 v1.4s, v3.8h, #3 +; CHECK-NEXT: ushll v0.4s, v7.4h, #3 +; CHECK-NEXT: ushll v16.4s, v1.4h, #3 +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v1.4s, v7.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h ; CHECK-NEXT: str q4, [x4] ; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v6.8h -; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v6.8h +; CHECK-NEXT: uaddw v2.4s, v16.4s, v6.4h 
; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -858,37 +858,37 @@ define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_ext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x2] +; CHECK-NEXT: ldp s1, s2, [x2] ; CHECK-NEXT: add x8, x3, #8 -; CHECK-NEXT: ldp s2, s3, [x0] +; CHECK-NEXT: ldp s3, s5, [x0] ; CHECK-NEXT: add x9, x1, #8 ; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ldp s4, s5, [x2, #8] -; CHECK-NEXT: ldp s6, s7, [x0, #8] +; CHECK-NEXT: ldp s6, s0, [x2, #8] +; CHECK-NEXT: ldp s7, s4, [x0, #8] ; CHECK-NEXT: add x11, x1, #12 -; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: ld1 { v7.s }[1], [x11] -; CHECK-NEXT: ld1 { v6.s }[1], [x9] -; CHECK-NEXT: ld1 { v4.s }[1], [x8] -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: ld1 { v3.s }[1], [x1] -; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b -; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: ld1 { v1.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v0.s }[1], [x10] +; CHECK-NEXT: ld1 { v4.s }[1], [x11] +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v2.s }[1], [x3] +; CHECK-NEXT: ld1 { v5.s }[1], [x1] +; CHECK-NEXT: ushll v16.8h, v0.8b, #0 ; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: ushll v6.4s, v1.4h, #3 -; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 -; CHECK-NEXT: ushll v0.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, v3.8h, #3 -; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v4.8h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h -; CHECK-NEXT: ushll v4.8h, v7.8b, #0 -; CHECK-NEXT: stp q4, q5, [x4] +; CHECK-NEXT: uaddl v6.8h, v1.8b, v6.8b +; CHECK-NEXT: uaddl v2.8h, v2.8b, v0.8b +; CHECK-NEXT: uaddl v5.8h, v5.8b, v4.8b +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: ushll v7.4s, v2.4h, #3 +; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 +; CHECK-NEXT: stp q4, q16, [x4] +; CHECK-NEXT: ushll v1.4s, v5.4h, #3 +; CHECK-NEXT: ushll2 v5.4s, v5.8h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v3.4h +; CHECK-NEXT: uaddw2 v1.4s, v5.4s, v3.8h +; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v6.8h +; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 diff --git a/llvm/test/CodeGen/AArch64/faddp-half.ll b/llvm/test/CodeGen/AArch64/faddp-half.ll index 6a450881dc978..6068a4742eea9 100644 --- a/llvm/test/CodeGen/AArch64/faddp-half.ll +++ b/llvm/test/CodeGen/AArch64/faddp-half.ll @@ -223,15 +223,15 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECKNOFP16: // %bb.0: // %entry ; CHECKNOFP16-NEXT: rev32 v5.8h, v0.8h ; CHECKNOFP16-NEXT: rev32 v4.8h, v1.8h -; CHECKNOFP16-NEXT: mov h2, v0.h[1] +; CHECKNOFP16-NEXT: mov h3, v0.h[1] ; CHECKNOFP16-NEXT: mov h6, v1.h[1] ; CHECKNOFP16-NEXT: fcvt s16, h0 ; CHECKNOFP16-NEXT: mov h17, v0.h[2] ; CHECKNOFP16-NEXT: fcvt s20, h1 ; CHECKNOFP16-NEXT: mov h21, v1.h[2] -; CHECKNOFP16-NEXT: mov h3, v5.h[1] +; CHECKNOFP16-NEXT: mov h2, v5.h[1] ; CHECKNOFP16-NEXT: mov h7, v4.h[1] -; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s3, h3 ; CHECKNOFP16-NEXT: fcvt s18, h5 ; CHECKNOFP16-NEXT: mov h19, v5.h[2] ; CHECKNOFP16-NEXT: fcvt s6, h6 @@ -241,7 +241,7 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { 
; CHECKNOFP16-NEXT: mov h24, v5.h[3] ; CHECKNOFP16-NEXT: fcvt s21, h21 ; CHECKNOFP16-NEXT: mov h25, v4.h[6] -; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fcvt s2, h2 ; CHECKNOFP16-NEXT: fcvt s7, h7 ; CHECKNOFP16-NEXT: fadd s16, s18, s16 ; CHECKNOFP16-NEXT: fcvt s18, h19 @@ -249,7 +249,7 @@ define <16 x half> @addp_v16f16(<16 x half> %a) { ; CHECKNOFP16-NEXT: fadd s20, s22, s20 ; CHECKNOFP16-NEXT: fcvt s22, h23 ; CHECKNOFP16-NEXT: mov h23, v4.h[3] -; CHECKNOFP16-NEXT: fadd s3, s3, s2 +; CHECKNOFP16-NEXT: fadd s3, s2, s3 ; CHECKNOFP16-NEXT: fadd s6, s7, s6 ; CHECKNOFP16-NEXT: mov h7, v1.h[3] ; CHECKNOFP16-NEXT: fcvt h2, s16 diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll index bfe8d173435c4..b5b9055fbc02f 100644 --- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll +++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll @@ -498,7 +498,7 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) { ; CHECK-NO16: // %bb.0: ; CHECK-NO16-NEXT: mov h2, v0.h[4] ; CHECK-NO16-NEXT: mov h3, v0.h[5] -; CHECK-NO16-NEXT: mov w9, #32767 // =0x7fff +; CHECK-NO16-NEXT: mov w8, #32767 // =0x7fff ; CHECK-NO16-NEXT: mov h4, v0.h[6] ; CHECK-NO16-NEXT: fmov s1, #4.00000000 ; CHECK-NO16-NEXT: mov w11, #-32768 // =0xffff8000 @@ -512,82 +512,82 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) { ; CHECK-NO16-NEXT: fcvt s4, h4 ; CHECK-NO16-NEXT: fcvt s5, h5 ; CHECK-NO16-NEXT: fcvt s6, h6 -; CHECK-NO16-NEXT: fcvt s0, h0 ; CHECK-NO16-NEXT: fmul s2, s2, s1 ; CHECK-NO16-NEXT: fmul s3, s3, s1 ; CHECK-NO16-NEXT: fmul s4, s4, s1 ; CHECK-NO16-NEXT: fmul s5, s5, s1 ; CHECK-NO16-NEXT: fmul s6, s6, s1 -; CHECK-NO16-NEXT: fmul s0, s0, s1 ; CHECK-NO16-NEXT: fcvt h2, s2 ; CHECK-NO16-NEXT: fcvt h3, s3 ; CHECK-NO16-NEXT: fcvt h4, s4 ; CHECK-NO16-NEXT: fcvt h5, s5 ; CHECK-NO16-NEXT: fcvt h6, s6 -; CHECK-NO16-NEXT: fcvt h0, s0 ; CHECK-NO16-NEXT: mov v2.h[1], v3.h[0] ; CHECK-NO16-NEXT: fcvt s3, h7 ; CHECK-NO16-NEXT: fmul s7, s16, s1 ; CHECK-NO16-NEXT: mov v2.h[2], v4.h[0] +; CHECK-NO16-NEXT: fcvt s4, h0 ; CHECK-NO16-NEXT: fmul s3, s3, s1 -; CHECK-NO16-NEXT: fcvt h4, s7 +; CHECK-NO16-NEXT: fcvt h0, s7 ; CHECK-NO16-NEXT: mov v2.h[3], v5.h[0] -; CHECK-NO16-NEXT: fcvt h1, s3 -; CHECK-NO16-NEXT: mov v4.h[1], v6.h[0] +; CHECK-NO16-NEXT: fmul s1, s4, s1 +; CHECK-NO16-NEXT: fcvt h3, s3 +; CHECK-NO16-NEXT: mov v0.h[1], v6.h[0] ; CHECK-NO16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-NO16-NEXT: mov v4.h[2], v1.h[0] -; CHECK-NO16-NEXT: mov s3, v2.s[1] -; CHECK-NO16-NEXT: mov v4.h[3], v0.h[0] -; CHECK-NO16-NEXT: mov s0, v2.s[2] +; CHECK-NO16-NEXT: fcvt h1, s1 +; CHECK-NO16-NEXT: mov v0.h[2], v3.h[0] +; CHECK-NO16-NEXT: mov s4, v2.s[1] ; CHECK-NO16-NEXT: fcvtzs w10, s2 +; CHECK-NO16-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NO16-NEXT: mov s1, v2.s[2] ; CHECK-NO16-NEXT: mov s2, v2.s[3] -; CHECK-NO16-NEXT: fcvtzs w8, s3 -; CHECK-NO16-NEXT: fcvtl v1.4s, v4.4h -; CHECK-NO16-NEXT: fcvtzs w12, s0 +; CHECK-NO16-NEXT: fcvtzs w9, s4 +; CHECK-NO16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NO16-NEXT: fcvtzs w12, s1 ; CHECK-NO16-NEXT: fcvtzs w13, s2 -; CHECK-NO16-NEXT: cmp w8, w9 -; CHECK-NO16-NEXT: mov s0, v1.s[1] -; CHECK-NO16-NEXT: fcvtzs w15, s1 -; CHECK-NO16-NEXT: csel w8, w8, w9, lt -; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768 -; CHECK-NO16-NEXT: csel w8, w8, w11, gt -; CHECK-NO16-NEXT: cmp w10, w9 -; CHECK-NO16-NEXT: csel w10, w10, w9, lt -; CHECK-NO16-NEXT: fcvtzs w14, s0 -; CHECK-NO16-NEXT: mov s0, v1.s[2] +; CHECK-NO16-NEXT: cmp w9, w8 +; CHECK-NO16-NEXT: csel w9, w9, w8, lt +; CHECK-NO16-NEXT: mov s1, v0.s[1] +; 
CHECK-NO16-NEXT: fcvtzs w15, s0 +; CHECK-NO16-NEXT: cmn w9, #8, lsl #12 // =32768 +; CHECK-NO16-NEXT: csel w9, w9, w11, gt +; CHECK-NO16-NEXT: cmp w10, w8 +; CHECK-NO16-NEXT: csel w10, w10, w8, lt ; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768 +; CHECK-NO16-NEXT: fcvtzs w14, s1 +; CHECK-NO16-NEXT: mov s1, v0.s[2] ; CHECK-NO16-NEXT: csel w10, w10, w11, gt -; CHECK-NO16-NEXT: cmp w12, w9 -; CHECK-NO16-NEXT: csel w12, w12, w9, lt +; CHECK-NO16-NEXT: cmp w12, w8 +; CHECK-NO16-NEXT: mov s0, v0.s[3] +; CHECK-NO16-NEXT: csel w12, w12, w8, lt ; CHECK-NO16-NEXT: cmn w12, #8, lsl #12 // =32768 -; CHECK-NO16-NEXT: fcvtzs w16, s0 -; CHECK-NO16-NEXT: mov s0, v1.s[3] ; CHECK-NO16-NEXT: csel w12, w12, w11, gt -; CHECK-NO16-NEXT: cmp w13, w9 +; CHECK-NO16-NEXT: cmp w13, w8 +; CHECK-NO16-NEXT: fcvtzs w16, s1 +; CHECK-NO16-NEXT: csel w13, w13, w8, lt ; CHECK-NO16-NEXT: fmov s1, w10 -; CHECK-NO16-NEXT: csel w13, w13, w9, lt ; CHECK-NO16-NEXT: cmn w13, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w13, w13, w11, gt -; CHECK-NO16-NEXT: cmp w14, w9 -; CHECK-NO16-NEXT: mov v1.s[1], w8 -; CHECK-NO16-NEXT: csel w14, w14, w9, lt -; CHECK-NO16-NEXT: fcvtzs w8, s0 +; CHECK-NO16-NEXT: cmp w14, w8 +; CHECK-NO16-NEXT: csel w14, w14, w8, lt +; CHECK-NO16-NEXT: mov v1.s[1], w9 +; CHECK-NO16-NEXT: fcvtzs w9, s0 ; CHECK-NO16-NEXT: cmn w14, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w14, w14, w11, gt -; CHECK-NO16-NEXT: cmp w15, w9 -; CHECK-NO16-NEXT: csel w15, w15, w9, lt -; CHECK-NO16-NEXT: mov v1.s[2], w12 +; CHECK-NO16-NEXT: cmp w15, w8 +; CHECK-NO16-NEXT: csel w15, w15, w8, lt ; CHECK-NO16-NEXT: cmn w15, #8, lsl #12 // =32768 +; CHECK-NO16-NEXT: mov v1.s[2], w12 ; CHECK-NO16-NEXT: csel w10, w15, w11, gt -; CHECK-NO16-NEXT: cmp w16, w9 +; CHECK-NO16-NEXT: cmp w16, w8 ; CHECK-NO16-NEXT: fmov s2, w10 -; CHECK-NO16-NEXT: csel w10, w16, w9, lt +; CHECK-NO16-NEXT: csel w10, w16, w8, lt ; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NO16-NEXT: mov v1.s[3], w13 ; CHECK-NO16-NEXT: csel w10, w10, w11, gt -; CHECK-NO16-NEXT: cmp w8, w9 +; CHECK-NO16-NEXT: cmp w9, w8 +; CHECK-NO16-NEXT: mov v1.s[3], w13 ; CHECK-NO16-NEXT: mov v2.s[1], w14 -; CHECK-NO16-NEXT: csel w8, w8, w9, lt +; CHECK-NO16-NEXT: csel w8, w9, w8, lt ; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768 ; CHECK-NO16-NEXT: csel w8, w8, w11, gt ; CHECK-NO16-NEXT: mov v2.s[2], w10 diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll index 80089288a0365..b7a645bfb546f 100644 --- a/llvm/test/CodeGen/AArch64/fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fdiv.ll @@ -428,10 +428,10 @@ define <16 x half> @fdiv_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[2] ; CHECK-SD-NOFP16-NEXT: fcvt h4, s4 ; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 -; CHECK-SD-NOFP16-NEXT: fdiv s6, s7, s6 -; CHECK-SD-NOFP16-NEXT: mov h7, v2.h[3] -; CHECK-SD-NOFP16-NEXT: fcvt s7, h7 -; CHECK-SD-NOFP16-NEXT: fdiv s7, s16, s7 +; CHECK-SD-NOFP16-NEXT: fdiv s7, s7, s6 +; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt s6, h6 +; CHECK-SD-NOFP16-NEXT: fdiv s6, s16, s6 ; CHECK-SD-NOFP16-NEXT: mov h16, v2.h[4] ; CHECK-SD-NOFP16-NEXT: fcvt s16, h16 ; CHECK-SD-NOFP16-NEXT: fdiv s16, s17, s16 @@ -473,12 +473,12 @@ define <16 x half> @fdiv_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-SD-NOFP16-NEXT: fcvt h2, s20 ; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 ; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v4.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s6 +; CHECK-SD-NOFP16-NEXT: fcvt h4, s7 ; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v5.h[0] ; CHECK-SD-NOFP16-NEXT: 
fcvt h5, s21 ; CHECK-SD-NOFP16-NEXT: fdiv s20, s25, s26 ; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v4.h[0] -; CHECK-SD-NOFP16-NEXT: fcvt h4, s7 +; CHECK-SD-NOFP16-NEXT: fcvt h4, s6 ; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v5.h[0] ; CHECK-SD-NOFP16-NEXT: fcvt h5, s22 ; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v4.h[0] diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll index 9766e22199377..79c99c48ce3dc 100644 --- a/llvm/test/CodeGen/AArch64/fpow.ll +++ b/llvm/test/CodeGen/AArch64/fpow.ll @@ -608,7 +608,7 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov s15, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-GI-NEXT: str s2, [sp, #64] // 4-byte Folded Spill +; CHECK-GI-NEXT: str s2, [sp, #48] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v4.s[2] ; CHECK-GI-NEXT: str s2, [sp, #112] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v3.s[3] @@ -626,17 +626,17 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: fmov s1, s13 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: fmov s1, s12 ; CHECK-GI-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr s0, [sp, #64] // 4-byte Folded Reload +; CHECK-GI-NEXT: ldr s0, [sp, #48] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: fmov s1, s11 -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr s0, [sp, #112] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: str d0, [sp, #112] // 16-byte Folded Spill @@ -649,7 +649,7 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: ldr x30, [sp, #192] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload @@ -657,7 +657,7 @@ define <8 x float> @pow_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -775,15 +775,15 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h8, v0.h[1] -; CHECK-GI-NEXT: mov h9, v0.h[2] -; CHECK-GI-NEXT: mov h10, v0.h[3] -; CHECK-GI-NEXT: mov h11, v0.h[4] +; CHECK-GI-NEXT: mov h9, v0.h[1] +; CHECK-GI-NEXT: mov h10, v0.h[2] +; CHECK-GI-NEXT: mov h11, v0.h[3] +; CHECK-GI-NEXT: mov h12, v0.h[4] ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h15, v1.h[2] -; CHECK-GI-NEXT: mov h13, v1.h[3] -; 
CHECK-GI-NEXT: mov h12, v1.h[4] -; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h8, v1.h[3] +; CHECK-GI-NEXT: mov h13, v1.h[4] +; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill @@ -793,34 +793,34 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h8 +; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h14 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h9 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h12 +; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #172] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -833,19 +833,20 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] @@ -1079,17 +1080,17 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h10, v0.h[1] -; CHECK-GI-NEXT: mov h11, v0.h[2] -; CHECK-GI-NEXT: mov 
h12, v0.h[3] -; CHECK-GI-NEXT: mov h13, v0.h[4] +; CHECK-GI-NEXT: mov h11, v0.h[1] +; CHECK-GI-NEXT: mov h12, v0.h[2] +; CHECK-GI-NEXT: mov h13, v0.h[3] +; CHECK-GI-NEXT: mov h14, v0.h[4] ; CHECK-GI-NEXT: mov h8, v1.h[1] ; CHECK-GI-NEXT: mov h9, v1.h[2] -; CHECK-GI-NEXT: mov h15, v1.h[3] -; CHECK-GI-NEXT: mov h14, v1.h[4] +; CHECK-GI-NEXT: mov h10, v1.h[3] +; CHECK-GI-NEXT: mov h15, v1.h[4] ; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] -; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #96] // 2-byte Folded Spill @@ -1101,27 +1102,27 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #190] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h15 +; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf @@ -1133,10 +1134,10 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #188] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -1149,7 +1150,7 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload @@ -1164,7 +1165,7 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] @@ -1367,27 +1368,27 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x 
half> %b) { ; CHECK-GI-NEXT: .cfi_offset b13, -64 ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 -; CHECK-GI-NEXT: mov h4, v0.h[4] +; CHECK-GI-NEXT: mov v4.16b, v1.16b ; CHECK-GI-NEXT: str q1, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: mov h11, v0.h[1] -; CHECK-GI-NEXT: mov h12, v0.h[2] -; CHECK-GI-NEXT: mov h13, v0.h[3] +; CHECK-GI-NEXT: mov h1, v0.h[4] +; CHECK-GI-NEXT: mov h12, v0.h[1] +; CHECK-GI-NEXT: mov h13, v0.h[2] ; CHECK-GI-NEXT: str q3, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov h14, v0.h[3] ; CHECK-GI-NEXT: mov h15, v2.h[1] ; CHECK-GI-NEXT: mov h8, v2.h[2] ; CHECK-GI-NEXT: mov h9, v2.h[3] ; CHECK-GI-NEXT: mov h10, v2.h[4] -; CHECK-GI-NEXT: mov h14, v2.h[5] -; CHECK-GI-NEXT: str h4, [sp, #288] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[5] -; CHECK-GI-NEXT: str h4, [sp, #240] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[6] -; CHECK-GI-NEXT: str h4, [sp, #176] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[7] +; CHECK-GI-NEXT: mov h11, v2.h[5] +; CHECK-GI-NEXT: str h1, [sp, #272] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[5] +; CHECK-GI-NEXT: str h1, [sp, #240] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[6] +; CHECK-GI-NEXT: str h1, [sp, #176] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h4, [sp, #144] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov v4.16b, v1.16b -; CHECK-GI-NEXT: mov h1, v1.h[1] +; CHECK-GI-NEXT: str h1, [sp, #144] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v4.h[1] ; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[2] ; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill @@ -1398,7 +1399,7 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: mov h1, v4.h[5] ; CHECK-GI-NEXT: str h1, [sp, #256] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[6] -; CHECK-GI-NEXT: str h1, [sp, #320] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #336] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[7] ; CHECK-GI-NEXT: str h1, [sp, #352] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] @@ -1416,40 +1417,40 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: mov h1, v3.h[5] ; CHECK-GI-NEXT: str h1, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[6] -; CHECK-GI-NEXT: str h1, [sp, #222] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #238] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[7] -; CHECK-GI-NEXT: str h1, [sp, #286] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #302] // 2-byte Folded Spill ; CHECK-GI-NEXT: fcvt s1, h2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 -; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl 
powf -; CHECK-GI-NEXT: ldr h1, [sp, #288] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 ; CHECK-GI-NEXT: fcvt s1, h10 -; CHECK-GI-NEXT: str q0, [sp, #288] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: str q0, [sp, #240] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf @@ -1517,11 +1518,11 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #320] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #336] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #222] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #238] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf @@ -1529,47 +1530,46 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 ; CHECK-GI-NEXT: str q0, [sp, #352] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #286] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h0, [sp, #302] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr q1, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q3, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp, #304] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #432] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #416] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #400] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #384] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #272] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #368] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded 
Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 ; CHECK-GI-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #352] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #352] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #448 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index a36a58660cd40..92fd3183393ea 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -2377,114 +2377,114 @@ define <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) { ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-CVT-NEXT: mov x8, #562949953421311 // =0x1ffffffffffff -; CHECK-CVT-NEXT: mov x11, #-562949953421312 // =0xfffe000000000000 +; CHECK-CVT-NEXT: mov x9, #-562949953421312 // =0xfffe000000000000 ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: fcvt s3, h1 ; CHECK-CVT-NEXT: mov h4, v1.h[2] ; CHECK-CVT-NEXT: mov h1, v1.h[3] ; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvtzs x9, s3 +; CHECK-CVT-NEXT: fcvtzs x10, s3 ; CHECK-CVT-NEXT: fcvt s3, h4 ; CHECK-CVT-NEXT: fcvt s1, h1 -; CHECK-CVT-NEXT: fcvtzs x10, s2 -; CHECK-CVT-NEXT: cmp x9, x8 +; CHECK-CVT-NEXT: fcvtzs x11, s2 +; CHECK-CVT-NEXT: cmp x10, x8 ; CHECK-CVT-NEXT: fcvtzs x12, s3 -; CHECK-CVT-NEXT: csel x9, x9, x8, lt +; CHECK-CVT-NEXT: csel x10, x10, x8, lt ; CHECK-CVT-NEXT: mov h2, v0.h[1] ; CHECK-CVT-NEXT: fcvt s3, h0 -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x4, x9, x11, gt -; CHECK-CVT-NEXT: cmp x10, x8 -; CHECK-CVT-NEXT: csel x9, x10, x8, lt -; CHECK-CVT-NEXT: fcvtzs x10, s1 +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x4, x10, x9, gt +; CHECK-CVT-NEXT: cmp x11, x8 +; CHECK-CVT-NEXT: csel x10, x11, x8, lt +; CHECK-CVT-NEXT: fcvtzs x11, s1 ; CHECK-CVT-NEXT: mov h1, v0.h[2] -; CHECK-CVT-NEXT: cmp x9, x11 +; CHECK-CVT-NEXT: cmp x10, x9 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: mov h0, v0.h[3] -; CHECK-CVT-NEXT: csel x5, x9, x11, gt +; CHECK-CVT-NEXT: csel x5, x10, x9, gt ; CHECK-CVT-NEXT: cmp x12, x8 -; CHECK-CVT-NEXT: csel x9, x12, x8, lt +; CHECK-CVT-NEXT: csel x10, x12, x8, lt ; CHECK-CVT-NEXT: fcvtzs x12, s3 -; CHECK-CVT-NEXT: cmp x9, x11 +; CHECK-CVT-NEXT: cmp x10, x9 ; CHECK-CVT-NEXT: fcvt s1, h1 -; CHECK-CVT-NEXT: csel x6, x9, x11, gt -; CHECK-CVT-NEXT: cmp x10, x8 +; CHECK-CVT-NEXT: csel x6, x10, x9, gt +; CHECK-CVT-NEXT: cmp x11, x8 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: csel x9, x10, x8, lt -; CHECK-CVT-NEXT: fcvtzs x10, s2 -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x7, x9, x11, gt +; CHECK-CVT-NEXT: csel x10, 
x11, x8, lt +; CHECK-CVT-NEXT: fcvtzs x11, s2 +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x7, x10, x9, gt ; CHECK-CVT-NEXT: cmp x12, x8 -; CHECK-CVT-NEXT: csel x9, x12, x8, lt +; CHECK-CVT-NEXT: csel x10, x12, x8, lt ; CHECK-CVT-NEXT: fcvtzs x12, s1 -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x0, x9, x11, gt -; CHECK-CVT-NEXT: cmp x10, x8 -; CHECK-CVT-NEXT: csel x9, x10, x8, lt -; CHECK-CVT-NEXT: fcvtzs x10, s0 -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x1, x9, x11, gt +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x0, x10, x9, gt +; CHECK-CVT-NEXT: cmp x11, x8 +; CHECK-CVT-NEXT: csel x10, x11, x8, lt +; CHECK-CVT-NEXT: fcvtzs x11, s0 +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x1, x10, x9, gt ; CHECK-CVT-NEXT: cmp x12, x8 -; CHECK-CVT-NEXT: csel x9, x12, x8, lt -; CHECK-CVT-NEXT: cmp x9, x11 -; CHECK-CVT-NEXT: csel x2, x9, x11, gt -; CHECK-CVT-NEXT: cmp x10, x8 -; CHECK-CVT-NEXT: csel x8, x10, x8, lt -; CHECK-CVT-NEXT: cmp x8, x11 -; CHECK-CVT-NEXT: csel x3, x8, x11, gt +; CHECK-CVT-NEXT: csel x10, x12, x8, lt +; CHECK-CVT-NEXT: cmp x10, x9 +; CHECK-CVT-NEXT: csel x2, x10, x9, gt +; CHECK-CVT-NEXT: cmp x11, x8 +; CHECK-CVT-NEXT: csel x8, x11, x8, lt +; CHECK-CVT-NEXT: cmp x8, x9 +; CHECK-CVT-NEXT: csel x3, x8, x9, gt ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_signed_v8f16_v8i50: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-FP16-NEXT: mov x8, #562949953421311 // =0x1ffffffffffff -; CHECK-FP16-NEXT: mov x11, #-562949953421312 // =0xfffe000000000000 +; CHECK-FP16-NEXT: mov x9, #-562949953421312 // =0xfffe000000000000 ; CHECK-FP16-NEXT: mov h2, v1.h[1] -; CHECK-FP16-NEXT: fcvtzs x9, h1 +; CHECK-FP16-NEXT: fcvtzs x10, h1 ; CHECK-FP16-NEXT: mov h3, v1.h[2] ; CHECK-FP16-NEXT: mov h1, v1.h[3] -; CHECK-FP16-NEXT: fcvtzs x10, h2 -; CHECK-FP16-NEXT: cmp x9, x8 +; CHECK-FP16-NEXT: fcvtzs x11, h2 +; CHECK-FP16-NEXT: cmp x10, x8 ; CHECK-FP16-NEXT: fcvtzs x12, h3 -; CHECK-FP16-NEXT: csel x9, x9, x8, lt +; CHECK-FP16-NEXT: csel x10, x10, x8, lt ; CHECK-FP16-NEXT: mov h2, v0.h[2] -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x4, x9, x11, gt -; CHECK-FP16-NEXT: cmp x10, x8 -; CHECK-FP16-NEXT: csel x9, x10, x8, lt -; CHECK-FP16-NEXT: fcvtzs x10, h1 +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x4, x10, x9, gt +; CHECK-FP16-NEXT: cmp x11, x8 +; CHECK-FP16-NEXT: csel x10, x11, x8, lt +; CHECK-FP16-NEXT: fcvtzs x11, h1 ; CHECK-FP16-NEXT: mov h1, v0.h[1] -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x5, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x5, x10, x9, gt ; CHECK-FP16-NEXT: cmp x12, x8 -; CHECK-FP16-NEXT: csel x9, x12, x8, lt +; CHECK-FP16-NEXT: csel x10, x12, x8, lt ; CHECK-FP16-NEXT: fcvtzs x12, h0 ; CHECK-FP16-NEXT: mov h0, v0.h[3] -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x6, x9, x11, gt -; CHECK-FP16-NEXT: cmp x10, x8 -; CHECK-FP16-NEXT: csel x9, x10, x8, lt -; CHECK-FP16-NEXT: fcvtzs x10, h1 -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x7, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x6, x10, x9, gt +; CHECK-FP16-NEXT: cmp x11, x8 +; CHECK-FP16-NEXT: csel x10, x11, x8, lt +; CHECK-FP16-NEXT: fcvtzs x11, h1 +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x7, x10, x9, gt ; CHECK-FP16-NEXT: cmp x12, x8 -; CHECK-FP16-NEXT: csel x9, x12, x8, lt +; CHECK-FP16-NEXT: csel x10, x12, x8, lt ; CHECK-FP16-NEXT: fcvtzs x12, h2 -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x0, x9, x11, gt -; 
CHECK-FP16-NEXT: cmp x10, x8 -; CHECK-FP16-NEXT: csel x9, x10, x8, lt -; CHECK-FP16-NEXT: fcvtzs x10, h0 -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x1, x9, x11, gt +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x0, x10, x9, gt +; CHECK-FP16-NEXT: cmp x11, x8 +; CHECK-FP16-NEXT: csel x10, x11, x8, lt +; CHECK-FP16-NEXT: fcvtzs x11, h0 +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x1, x10, x9, gt ; CHECK-FP16-NEXT: cmp x12, x8 -; CHECK-FP16-NEXT: csel x9, x12, x8, lt -; CHECK-FP16-NEXT: cmp x9, x11 -; CHECK-FP16-NEXT: csel x2, x9, x11, gt -; CHECK-FP16-NEXT: cmp x10, x8 -; CHECK-FP16-NEXT: csel x8, x10, x8, lt -; CHECK-FP16-NEXT: cmp x8, x11 -; CHECK-FP16-NEXT: csel x3, x8, x11, gt +; CHECK-FP16-NEXT: csel x10, x12, x8, lt +; CHECK-FP16-NEXT: cmp x10, x9 +; CHECK-FP16-NEXT: csel x2, x10, x9, gt +; CHECK-FP16-NEXT: cmp x11, x8 +; CHECK-FP16-NEXT: csel x8, x11, x8, lt +; CHECK-FP16-NEXT: cmp x8, x9 +; CHECK-FP16-NEXT: csel x3, x8, x9, gt ; CHECK-FP16-NEXT: ret %x = call <8 x i50> @llvm.fptosi.sat.v8f16.v8i50(<8 x half> %f) ret <8 x i50> %x @@ -2596,11 +2596,11 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: mov w8, #1895825407 // =0x70ffffff ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: fmov s9, w8 -; CHECK-NEXT: mov x21, #-34359738368 // =0xfffffff800000000 +; CHECK-NEXT: mov x22, #-34359738368 // =0xfffffff800000000 ; CHECK-NEXT: mov x23, #34359738367 // =0x7ffffffff ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le @@ -2616,7 +2616,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x21, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le @@ -2630,7 +2630,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le @@ -2645,14 +2645,14 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[1] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x23, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x28, xzr, x8, vs +; CHECK-NEXT: csel x27, xzr, x8, vs ; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 @@ -2660,60 +2660,60 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x23, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: 
csel x27, xzr, x8, vs -; CHECK-NEXT: csel x20, xzr, x9, vs +; CHECK-NEXT: csel x20, xzr, x8, vs +; CHECK-NEXT: csel x21, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x21, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x22, xzr, x8, vs -; CHECK-NEXT: csel x29, xzr, x9, vs +; CHECK-NEXT: csel x28, xzr, x8, vs +; CHECK-NEXT: csel x24, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x23, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x24, xzr, x8, vs -; CHECK-NEXT: csel x25, xzr, x9, vs +; CHECK-NEXT: csel x25, xzr, x8, vs +; CHECK-NEXT: csel x29, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr x9, [sp] // 8-byte Folded Reload -; CHECK-NEXT: extr x8, x29, x22, #28 +; CHECK-NEXT: extr x8, x24, x28, #28 ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: bfi x24, x20, #36, #28 -; CHECK-NEXT: lsr x11, x27, #28 +; CHECK-NEXT: bfi x25, x21, #36, #28 +; CHECK-NEXT: lsr x11, x20, #28 ; CHECK-NEXT: stur x9, [x19, #75] -; CHECK-NEXT: extr x9, x27, x20, #28 +; CHECK-NEXT: extr x9, x20, x21, #28 ; CHECK-NEXT: stur x8, [x19, #41] -; CHECK-NEXT: csel x8, x21, x1, lt +; CHECK-NEXT: csel x8, x22, x1, lt ; CHECK-NEXT: str x9, [x19, #16] ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: stp x25, x24, [x19] +; CHECK-NEXT: stp x29, x25, [x19] ; CHECK-NEXT: stur x10, [x19, #50] -; CHECK-NEXT: lsr x10, x29, #28 +; CHECK-NEXT: lsr x10, x24, #28 ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x23, x8, gt ; CHECK-NEXT: fcmp s8, s8 @@ -2723,9 +2723,9 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x8, xzr, x8, vs ; CHECK-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: bfi x8, x22, #36, #28 +; CHECK-NEXT: bfi x8, x28, #36, #28 ; CHECK-NEXT: extr x10, x14, x12, #28 -; CHECK-NEXT: bfi x28, x12, #36, #28 +; CHECK-NEXT: bfi x27, x12, #36, #28 ; CHECK-NEXT: ldr x12, [sp, #72] // 8-byte Folded Reload ; CHECK-NEXT: bfi x26, x13, #36, #28 ; CHECK-NEXT: stur x9, [x19, #25] @@ -2734,7 +2734,7 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: stur x8, [x19, #33] ; CHECK-NEXT: lsr x8, x12, #28 ; CHECK-NEXT: stur x10, [x19, #91] -; CHECK-NEXT: stur x28, [x19, #83] +; CHECK-NEXT: stur x27, [x19, #83] ; CHECK-NEXT: stur x11, [x19, #66] ; CHECK-NEXT: stur x26, [x19, #58] ; CHECK-NEXT: strb w9, [x19, #99] @@ -2792,14 +2792,14 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: mov w8, #2130706431 // =0x7effffff ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: fmov s10, w8 -; CHECK-NEXT: mov x23, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: mov x22, #9223372036854775807 // =0x7fffffffffffffff +; CHECK-NEXT: mov x22, #-9223372036854775808 // =0x8000000000000000 +; CHECK-NEXT: 
mov x23, #9223372036854775807 // =0x7fffffffffffffff ; CHECK-NEXT: mov h0, v0.h[1] ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2813,9 +2813,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2828,9 +2828,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2843,9 +2843,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2857,9 +2857,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[1] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2871,9 +2871,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2885,9 +2885,9 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 @@ -2900,10 +2900,10 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) { ; CHECK-NEXT: stp x24, x25, [x19, #16] ; CHECK-NEXT: stp x20, x21, [x19] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x23, x1, lt +; CHECK-NEXT: csel x9, x22, x1, lt ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: stp x28, x29, [x19, #112] -; CHECK-NEXT: csel x9, x22, x9, gt +; 
CHECK-NEXT: csel x9, x23, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: csel x9, xzr, x9, vs @@ -3030,6 +3030,7 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ; CHECK-CVT-NEXT: mov s3, v2.s[1] ; CHECK-CVT-NEXT: fcvtzs w10, s2 +; CHECK-CVT-NEXT: fcvtzs w16, s1 ; CHECK-CVT-NEXT: fcvtzs w9, s3 ; CHECK-CVT-NEXT: mov s3, v2.s[2] ; CHECK-CVT-NEXT: mov s2, v2.s[3] @@ -3041,6 +3042,7 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtzs w14, s2 ; CHECK-CVT-NEXT: cmn w11, #128 ; CHECK-CVT-NEXT: mov s2, v1.s[2] +; CHECK-CVT-NEXT: mov s1, v1.s[3] ; CHECK-CVT-NEXT: csel w11, w11, w9, gt ; CHECK-CVT-NEXT: cmp w10, #127 ; CHECK-CVT-NEXT: csel w10, w10, w8, lt @@ -3050,56 +3052,53 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ; CHECK-CVT-NEXT: csel w13, w10, w9, gt ; CHECK-CVT-NEXT: cmp w12, #127 -; CHECK-CVT-NEXT: fcvtzs w16, s2 +; CHECK-CVT-NEXT: fcvtzs w17, s1 ; CHECK-CVT-NEXT: csel w10, w12, w8, lt ; CHECK-CVT-NEXT: cmn w10, #128 -; CHECK-CVT-NEXT: mov s2, v3.s[1] +; CHECK-CVT-NEXT: mov s1, v3.s[2] ; CHECK-CVT-NEXT: fcvtzs w0, s3 ; CHECK-CVT-NEXT: csel w10, w10, w9, gt ; CHECK-CVT-NEXT: cmp w14, #127 ; CHECK-CVT-NEXT: fcvtzs w4, s0 ; CHECK-CVT-NEXT: csel w12, w14, w8, lt -; CHECK-CVT-NEXT: fcvtzs w14, s1 -; CHECK-CVT-NEXT: mov s1, v1.s[3] ; CHECK-CVT-NEXT: cmn w12, #128 ; CHECK-CVT-NEXT: csel w12, w12, w9, gt ; CHECK-CVT-NEXT: cmp w15, #127 -; CHECK-CVT-NEXT: fcvtzs w18, s2 -; CHECK-CVT-NEXT: csel w15, w15, w8, lt -; CHECK-CVT-NEXT: mov s2, v3.s[3] -; CHECK-CVT-NEXT: cmn w15, #128 -; CHECK-CVT-NEXT: fcvtzs w17, s1 -; CHECK-CVT-NEXT: mov s1, v3.s[2] -; CHECK-CVT-NEXT: csel w15, w15, w9, gt -; CHECK-CVT-NEXT: cmp w14, #127 -; CHECK-CVT-NEXT: csel w14, w14, w8, lt +; CHECK-CVT-NEXT: fcvtzs w1, s1 +; CHECK-CVT-NEXT: csel w14, w15, w8, lt +; CHECK-CVT-NEXT: fcvtzs w15, s2 +; CHECK-CVT-NEXT: mov s2, v3.s[1] ; CHECK-CVT-NEXT: cmn w14, #128 -; CHECK-CVT-NEXT: fcvtzs w2, s2 -; CHECK-CVT-NEXT: fmov s2, w13 +; CHECK-CVT-NEXT: mov s1, v0.s[1] ; CHECK-CVT-NEXT: csel w14, w14, w9, gt ; CHECK-CVT-NEXT: cmp w16, #127 -; CHECK-CVT-NEXT: fcvtzs w1, s1 ; CHECK-CVT-NEXT: csel w16, w16, w8, lt -; CHECK-CVT-NEXT: mov s1, v0.s[1] ; CHECK-CVT-NEXT: cmn w16, #128 -; CHECK-CVT-NEXT: mov v2.s[1], w11 +; CHECK-CVT-NEXT: fcvtzs w18, s2 +; CHECK-CVT-NEXT: mov s2, v3.s[3] ; CHECK-CVT-NEXT: csel w16, w16, w9, gt +; CHECK-CVT-NEXT: cmp w15, #127 +; CHECK-CVT-NEXT: fcvtzs w3, s1 +; CHECK-CVT-NEXT: csel w15, w15, w8, lt +; CHECK-CVT-NEXT: mov s1, v0.s[2] +; CHECK-CVT-NEXT: mov s0, v0.s[3] +; CHECK-CVT-NEXT: cmn w15, #128 +; CHECK-CVT-NEXT: csel w15, w15, w9, gt ; CHECK-CVT-NEXT: cmp w17, #127 +; CHECK-CVT-NEXT: fcvtzs w2, s2 ; CHECK-CVT-NEXT: csel w17, w17, w8, lt +; CHECK-CVT-NEXT: fmov s2, w13 ; CHECK-CVT-NEXT: cmn w17, #128 -; CHECK-CVT-NEXT: fcvtzs w3, s1 -; CHECK-CVT-NEXT: mov s1, v0.s[2] ; CHECK-CVT-NEXT: csel w17, w17, w9, gt ; CHECK-CVT-NEXT: cmp w18, #127 -; CHECK-CVT-NEXT: mov s0, v0.s[3] ; CHECK-CVT-NEXT: csel w18, w18, w8, lt -; CHECK-CVT-NEXT: mov v2.s[2], w10 +; CHECK-CVT-NEXT: mov v2.s[1], w11 ; CHECK-CVT-NEXT: cmn w18, #128 ; CHECK-CVT-NEXT: csel w18, w18, w9, gt ; CHECK-CVT-NEXT: cmp w0, #127 ; CHECK-CVT-NEXT: csel w0, w0, w8, lt ; CHECK-CVT-NEXT: cmn w0, #128 -; CHECK-CVT-NEXT: mov v2.s[3], w12 +; CHECK-CVT-NEXT: mov v2.s[2], w10 ; CHECK-CVT-NEXT: csel w0, w0, w9, gt ; CHECK-CVT-NEXT: cmp w1, #127 ; CHECK-CVT-NEXT: csel w1, w1, w8, lt @@ 
-3107,6 +3106,7 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: cmn w1, #128 ; CHECK-CVT-NEXT: csel w1, w1, w9, gt ; CHECK-CVT-NEXT: cmp w2, #127 +; CHECK-CVT-NEXT: mov v2.s[3], w12 ; CHECK-CVT-NEXT: csel w2, w2, w8, lt ; CHECK-CVT-NEXT: mov v3.s[1], w18 ; CHECK-CVT-NEXT: cmn w2, #128 @@ -3119,18 +3119,18 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) { ; CHECK-CVT-NEXT: cmp w4, #127 ; CHECK-CVT-NEXT: csel w3, w4, w8, lt ; CHECK-CVT-NEXT: fcvtzs w4, s1 -; CHECK-CVT-NEXT: fmov s1, w14 +; CHECK-CVT-NEXT: fmov s1, w16 ; CHECK-CVT-NEXT: cmn w3, #128 ; CHECK-CVT-NEXT: csel w11, w3, w9, gt ; CHECK-CVT-NEXT: mov v3.s[3], w2 ; CHECK-CVT-NEXT: fmov s4, w11 -; CHECK-CVT-NEXT: mov v1.s[1], w15 +; CHECK-CVT-NEXT: mov v1.s[1], w14 ; CHECK-CVT-NEXT: fcvtzs w11, s0 ; CHECK-CVT-NEXT: cmp w4, #127 ; CHECK-CVT-NEXT: mov v4.s[1], w13 ; CHECK-CVT-NEXT: csel w13, w4, w8, lt ; CHECK-CVT-NEXT: cmn w13, #128 -; CHECK-CVT-NEXT: mov v1.s[2], w16 +; CHECK-CVT-NEXT: mov v1.s[2], w15 ; CHECK-CVT-NEXT: csel w10, w13, w9, gt ; CHECK-CVT-NEXT: cmp w11, #127 ; CHECK-CVT-NEXT: csel w8, w11, w8, lt @@ -3163,6 +3163,7 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ; CHECK-CVT-NEXT: mov s3, v2.s[1] ; CHECK-CVT-NEXT: fcvtzs w10, s2 +; CHECK-CVT-NEXT: fcvtzs w16, s0 ; CHECK-CVT-NEXT: fcvtzs w9, s3 ; CHECK-CVT-NEXT: mov s3, v2.s[2] ; CHECK-CVT-NEXT: mov s2, v2.s[3] @@ -3174,6 +3175,7 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtzs w14, s2 ; CHECK-CVT-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: mov s2, v0.s[2] +; CHECK-CVT-NEXT: mov s0, v0.s[3] ; CHECK-CVT-NEXT: csel w11, w11, w9, gt ; CHECK-CVT-NEXT: cmp w10, w8 ; CHECK-CVT-NEXT: csel w10, w10, w8, lt @@ -3183,55 +3185,52 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ; CHECK-CVT-NEXT: csel w13, w10, w9, gt ; CHECK-CVT-NEXT: cmp w12, w8 -; CHECK-CVT-NEXT: fcvtzs w16, s2 +; CHECK-CVT-NEXT: fcvtzs w17, s0 ; CHECK-CVT-NEXT: csel w10, w12, w8, lt ; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: mov s2, v3.s[1] +; CHECK-CVT-NEXT: mov s0, v3.s[2] ; CHECK-CVT-NEXT: fcvtzs w0, s3 ; CHECK-CVT-NEXT: csel w10, w10, w9, gt ; CHECK-CVT-NEXT: cmp w14, w8 ; CHECK-CVT-NEXT: fcvtzs w4, s1 ; CHECK-CVT-NEXT: csel w12, w14, w8, lt -; CHECK-CVT-NEXT: fcvtzs w14, s0 -; CHECK-CVT-NEXT: mov s0, v0.s[3] ; CHECK-CVT-NEXT: cmn w12, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w12, w12, w9, gt ; CHECK-CVT-NEXT: cmp w15, w8 -; CHECK-CVT-NEXT: fcvtzs w18, s2 -; CHECK-CVT-NEXT: csel w15, w15, w8, lt -; CHECK-CVT-NEXT: mov s2, v3.s[3] -; CHECK-CVT-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fcvtzs w17, s0 -; CHECK-CVT-NEXT: mov s0, v3.s[2] -; CHECK-CVT-NEXT: csel w15, w15, w9, gt -; CHECK-CVT-NEXT: cmp w14, w8 -; CHECK-CVT-NEXT: csel w14, w14, w8, lt +; CHECK-CVT-NEXT: fcvtzs w1, s0 +; CHECK-CVT-NEXT: csel w14, w15, w8, lt +; CHECK-CVT-NEXT: fcvtzs w15, s2 +; CHECK-CVT-NEXT: mov s2, v3.s[1] ; CHECK-CVT-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fcvtzs w2, s2 -; CHECK-CVT-NEXT: fmov s2, w13 +; CHECK-CVT-NEXT: mov s0, v1.s[1] ; CHECK-CVT-NEXT: csel w14, w14, w9, gt ; CHECK-CVT-NEXT: cmp w16, w8 -; CHECK-CVT-NEXT: fcvtzs w1, s0 ; CHECK-CVT-NEXT: csel w16, w16, w8, lt -; CHECK-CVT-NEXT: mov s0, v1.s[1] ; CHECK-CVT-NEXT: cmn w16, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: mov v2.s[1], w11 +; CHECK-CVT-NEXT: fcvtzs w18, s2 +; CHECK-CVT-NEXT: mov s2, v3.s[3] ; 
CHECK-CVT-NEXT: csel w16, w16, w9, gt +; CHECK-CVT-NEXT: cmp w15, w8 +; CHECK-CVT-NEXT: fcvtzs w3, s0 +; CHECK-CVT-NEXT: csel w15, w15, w8, lt +; CHECK-CVT-NEXT: mov s0, v1.s[2] +; CHECK-CVT-NEXT: cmn w15, #8, lsl #12 // =32768 +; CHECK-CVT-NEXT: csel w15, w15, w9, gt ; CHECK-CVT-NEXT: cmp w17, w8 +; CHECK-CVT-NEXT: fcvtzs w2, s2 ; CHECK-CVT-NEXT: csel w17, w17, w8, lt +; CHECK-CVT-NEXT: fmov s2, w13 ; CHECK-CVT-NEXT: cmn w17, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fcvtzs w3, s0 -; CHECK-CVT-NEXT: mov s0, v1.s[2] ; CHECK-CVT-NEXT: csel w17, w17, w9, gt ; CHECK-CVT-NEXT: cmp w18, w8 -; CHECK-CVT-NEXT: mov v2.s[2], w10 ; CHECK-CVT-NEXT: csel w18, w18, w8, lt +; CHECK-CVT-NEXT: mov v2.s[1], w11 ; CHECK-CVT-NEXT: cmn w18, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w18, w18, w9, gt ; CHECK-CVT-NEXT: cmp w0, w8 ; CHECK-CVT-NEXT: csel w0, w0, w8, lt -; CHECK-CVT-NEXT: mov v2.s[3], w12 ; CHECK-CVT-NEXT: cmn w0, #8, lsl #12 // =32768 +; CHECK-CVT-NEXT: mov v2.s[2], w10 ; CHECK-CVT-NEXT: csel w0, w0, w9, gt ; CHECK-CVT-NEXT: cmp w1, w8 ; CHECK-CVT-NEXT: csel w1, w1, w8, lt @@ -3239,6 +3238,7 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: cmn w1, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w1, w1, w9, gt ; CHECK-CVT-NEXT: cmp w2, w8 +; CHECK-CVT-NEXT: mov v2.s[3], w12 ; CHECK-CVT-NEXT: csel w2, w2, w8, lt ; CHECK-CVT-NEXT: mov v3.s[1], w18 ; CHECK-CVT-NEXT: cmn w2, #8, lsl #12 // =32768 @@ -3253,18 +3253,18 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) { ; CHECK-CVT-NEXT: fcvtzs w4, s0 ; CHECK-CVT-NEXT: mov s0, v1.s[3] ; CHECK-CVT-NEXT: cmn w3, #8, lsl #12 // =32768 -; CHECK-CVT-NEXT: fmov s1, w14 +; CHECK-CVT-NEXT: fmov s1, w16 ; CHECK-CVT-NEXT: csel w11, w3, w9, gt ; CHECK-CVT-NEXT: mov v3.s[3], w2 ; CHECK-CVT-NEXT: fmov s4, w11 -; CHECK-CVT-NEXT: mov v1.s[1], w15 +; CHECK-CVT-NEXT: mov v1.s[1], w14 ; CHECK-CVT-NEXT: cmp w4, w8 ; CHECK-CVT-NEXT: fcvtzs w11, s0 ; CHECK-CVT-NEXT: mov v4.s[1], w13 ; CHECK-CVT-NEXT: csel w13, w4, w8, lt ; CHECK-CVT-NEXT: cmn w13, #8, lsl #12 // =32768 ; CHECK-CVT-NEXT: csel w10, w13, w9, gt -; CHECK-CVT-NEXT: mov v1.s[2], w16 +; CHECK-CVT-NEXT: mov v1.s[2], w15 ; CHECK-CVT-NEXT: cmp w11, w8 ; CHECK-CVT-NEXT: csel w8, w11, w8, lt ; CHECK-CVT-NEXT: mov v4.s[2], w10 @@ -3289,9 +3289,8 @@ define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov d4, v3.d[1] -; CHECK-NEXT: fcvtzs w9, d3 -; CHECK-NEXT: mov w10, #127 // =0x7f -; CHECK-NEXT: mov w11, #-128 // =0xffffff80 +; CHECK-NEXT: fcvtzs w11, d3 +; CHECK-NEXT: mov w9, #127 // =0x7f ; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: fcvtzs w13, d2 ; CHECK-NEXT: fcvtzs w15, d1 @@ -3303,47 +3302,48 @@ define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) { ; CHECK-NEXT: cmp w8, #127 ; CHECK-NEXT: fcvtzs w12, d4 ; CHECK-NEXT: fcvtzs w16, d2 -; CHECK-NEXT: csel w8, w8, w10, lt -; CHECK-NEXT: cmn w8, #128 -; CHECK-NEXT: csel w8, w8, w11, gt -; CHECK-NEXT: cmp w9, #127 -; CHECK-NEXT: csel w9, w9, w10, lt -; CHECK-NEXT: cmn w9, #128 -; CHECK-NEXT: csel w9, w9, w11, gt +; CHECK-NEXT: csel w10, w8, w9, lt +; CHECK-NEXT: mov w8, #-128 // =0xffffff80 +; CHECK-NEXT: cmn w10, #128 +; CHECK-NEXT: csel w10, w10, w8, gt +; CHECK-NEXT: cmp w11, #127 +; CHECK-NEXT: csel w11, w11, w9, lt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: csel w11, w11, w8, gt ; CHECK-NEXT: cmp w12, #127 -; CHECK-NEXT: csel w12, w12, w10, lt -; CHECK-NEXT: fmov s3, w9 +; CHECK-NEXT: csel w12, w12, w9, lt +; CHECK-NEXT: fmov s3, w11 ; 
CHECK-NEXT: cmn w12, #128 -; CHECK-NEXT: csel w12, w12, w11, gt +; CHECK-NEXT: csel w12, w12, w8, gt ; CHECK-NEXT: cmp w13, #127 -; CHECK-NEXT: csel w13, w13, w10, lt -; CHECK-NEXT: mov v3.s[1], w8 +; CHECK-NEXT: csel w13, w13, w9, lt +; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: cmn w13, #128 -; CHECK-NEXT: csel w13, w13, w11, gt +; CHECK-NEXT: csel w13, w13, w8, gt ; CHECK-NEXT: cmp w14, #127 -; CHECK-NEXT: csel w14, w14, w10, lt +; CHECK-NEXT: csel w14, w14, w9, lt ; CHECK-NEXT: fmov s2, w13 ; CHECK-NEXT: cmn w14, #128 -; CHECK-NEXT: csel w14, w14, w11, gt +; CHECK-NEXT: csel w14, w14, w8, gt ; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: csel w15, w15, w10, lt +; CHECK-NEXT: csel w15, w15, w9, lt ; CHECK-NEXT: mov v2.s[1], w12 ; CHECK-NEXT: cmn w15, #128 -; CHECK-NEXT: csel w15, w15, w11, gt +; CHECK-NEXT: csel w15, w15, w8, gt ; CHECK-NEXT: cmp w16, #127 -; CHECK-NEXT: csel w9, w16, w10, lt +; CHECK-NEXT: csel w11, w16, w9, lt ; CHECK-NEXT: fmov s1, w15 -; CHECK-NEXT: cmn w9, #128 -; CHECK-NEXT: csel w8, w9, w11, gt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: csel w10, w11, w8, gt ; CHECK-NEXT: cmp w17, #127 -; CHECK-NEXT: csel w9, w17, w10, lt +; CHECK-NEXT: csel w9, w17, w9, lt ; CHECK-NEXT: mov v1.s[1], w14 ; CHECK-NEXT: cmn w9, #128 -; CHECK-NEXT: csel w9, w9, w11, gt -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: csel w8, w9, w8, gt +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: adrp x8, .LCPI82_0 ; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI82_0] +; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f) @@ -3491,61 +3491,61 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) { ; CHECK-LABEL: test_signed_v8f64_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov d4, v3.d[1] -; CHECK-NEXT: mov w9, #32767 // =0x7fff -; CHECK-NEXT: fcvtzs w10, d3 -; CHECK-NEXT: mov w11, #-32768 // =0xffff8000 +; CHECK-NEXT: mov w8, #32767 // =0x7fff +; CHECK-NEXT: fcvtzs w11, d3 ; CHECK-NEXT: mov d3, v1.d[1] ; CHECK-NEXT: fcvtzs w13, d2 ; CHECK-NEXT: fcvtzs w15, d1 ; CHECK-NEXT: fcvtzs w17, d0 -; CHECK-NEXT: fcvtzs w8, d4 +; CHECK-NEXT: fcvtzs w9, d4 ; CHECK-NEXT: mov d4, v2.d[1] ; CHECK-NEXT: mov d2, v0.d[1] ; CHECK-NEXT: fcvtzs w14, d3 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: fcvtzs w12, d4 ; CHECK-NEXT: fcvtzs w16, d2 -; CHECK-NEXT: csel w8, w8, w9, lt -; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w8, w8, w11, gt -; CHECK-NEXT: cmp w10, w9 -; CHECK-NEXT: csel w10, w10, w9, lt +; CHECK-NEXT: csel w10, w9, w8, lt +; CHECK-NEXT: mov w9, #-32768 // =0xffff8000 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w10, w10, w11, gt -; CHECK-NEXT: cmp w12, w9 -; CHECK-NEXT: csel w12, w12, w9, lt -; CHECK-NEXT: fmov s3, w10 +; CHECK-NEXT: csel w10, w10, w9, gt +; CHECK-NEXT: cmp w11, w8 +; CHECK-NEXT: csel w11, w11, w8, lt +; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w11, w11, w9, gt +; CHECK-NEXT: cmp w12, w8 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: fmov s3, w11 ; CHECK-NEXT: cmn w12, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w12, w12, w11, gt -; CHECK-NEXT: cmp w13, w9 -; CHECK-NEXT: csel w13, w13, w9, lt -; CHECK-NEXT: mov v3.s[1], w8 +; CHECK-NEXT: csel w12, w12, w9, gt +; CHECK-NEXT: cmp w13, w8 +; CHECK-NEXT: csel w13, w13, w8, lt +; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: cmn w13, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w13, w13, w11, gt -; CHECK-NEXT: cmp w14, w9 -; 
CHECK-NEXT: csel w14, w14, w9, lt +; CHECK-NEXT: csel w13, w13, w9, gt +; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: csel w14, w14, w8, lt ; CHECK-NEXT: fmov s2, w13 ; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w14, w14, w11, gt -; CHECK-NEXT: cmp w15, w9 -; CHECK-NEXT: csel w15, w15, w9, lt +; CHECK-NEXT: csel w14, w14, w9, gt +; CHECK-NEXT: cmp w15, w8 +; CHECK-NEXT: csel w15, w15, w8, lt ; CHECK-NEXT: mov v2.s[1], w12 ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w15, w15, w11, gt -; CHECK-NEXT: cmp w16, w9 -; CHECK-NEXT: csel w10, w16, w9, lt +; CHECK-NEXT: csel w15, w15, w9, gt +; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: csel w11, w16, w8, lt ; CHECK-NEXT: fmov s1, w15 -; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w8, w10, w11, gt -; CHECK-NEXT: cmp w17, w9 -; CHECK-NEXT: csel w9, w17, w9, lt +; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w10, w11, w9, gt +; CHECK-NEXT: cmp w17, w8 +; CHECK-NEXT: csel w8, w17, w8, lt ; CHECK-NEXT: mov v1.s[1], w14 -; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w9, w9, w11, gt -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w8, w8, w9, gt +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: adrp x8, .LCPI84_0 ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI84_0] +; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f) @@ -3563,7 +3563,7 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: fcvtzs w15, d1 ; CHECK-NEXT: mov d1, v7.d[1] ; CHECK-NEXT: fcvtzs w18, d0 -; CHECK-NEXT: fcvtzs w0, d7 +; CHECK-NEXT: fcvtzs w1, d7 ; CHECK-NEXT: fcvtzs w2, d6 ; CHECK-NEXT: fcvtzs w4, d5 ; CHECK-NEXT: fcvtzs w6, d4 @@ -3571,30 +3571,26 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: mov d16, v2.d[1] ; CHECK-NEXT: mov d2, v0.d[1] ; CHECK-NEXT: mov d0, v6.d[1] +; CHECK-NEXT: fcvtzs w0, d1 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: fcvtzs w13, d16 ; CHECK-NEXT: fcvtzs w17, d2 ; CHECK-NEXT: csel w10, w8, w9, lt ; CHECK-NEXT: mov w8, #-32768 // =0xffff8000 -; CHECK-NEXT: fcvtzs w1, d0 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 -; CHECK-NEXT: mov d0, v5.d[1] ; CHECK-NEXT: csel w10, w10, w8, gt ; CHECK-NEXT: cmp w11, w9 ; CHECK-NEXT: csel w11, w11, w9, lt ; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w12, w11, w8, gt ; CHECK-NEXT: cmp w13, w9 -; CHECK-NEXT: fcvtzs w3, d0 ; CHECK-NEXT: csel w11, w13, w9, lt ; CHECK-NEXT: fcvtzs w13, d3 -; CHECK-NEXT: mov d0, v4.d[1] ; CHECK-NEXT: cmn w11, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w11, w11, w8, gt ; CHECK-NEXT: cmp w14, w9 ; CHECK-NEXT: csel w14, w14, w9, lt ; CHECK-NEXT: cmn w14, #8, lsl #12 // =32768 -; CHECK-NEXT: fcvtzs w5, d0 ; CHECK-NEXT: csel w14, w14, w8, gt ; CHECK-NEXT: cmp w13, w9 ; CHECK-NEXT: csel w13, w13, w9, lt @@ -3606,60 +3602,64 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: csel w16, w15, w8, gt ; CHECK-NEXT: cmp w17, w9 ; CHECK-NEXT: csel w15, w17, w9, lt -; CHECK-NEXT: fcvtzs w17, d1 -; CHECK-NEXT: fmov s3, w12 ; CHECK-NEXT: cmn w15, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w15, w15, w8, gt ; CHECK-NEXT: cmp w18, w9 -; CHECK-NEXT: csel w18, w18, w9, lt -; CHECK-NEXT: mov v3.s[1], w10 -; CHECK-NEXT: cmn w18, #8, lsl #12 // =32768 -; CHECK-NEXT: fmov s2, w14 -; CHECK-NEXT: csel w18, w18, w8, gt -; CHECK-NEXT: cmp w17, w9 -; 
CHECK-NEXT: csel w17, w17, w9, lt +; CHECK-NEXT: csel w17, w18, w9, lt ; CHECK-NEXT: cmn w17, #8, lsl #12 // =32768 -; CHECK-NEXT: mov v2.s[1], w11 ; CHECK-NEXT: csel w17, w17, w8, gt ; CHECK-NEXT: cmp w0, w9 -; CHECK-NEXT: fmov s1, w16 -; CHECK-NEXT: csel w0, w0, w9, lt -; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768 -; CHECK-NEXT: csel w0, w0, w8, gt +; CHECK-NEXT: csel w18, w0, w9, lt +; CHECK-NEXT: fcvtzs w0, d0 +; CHECK-NEXT: mov d0, v5.d[1] +; CHECK-NEXT: cmn w18, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w18, w18, w8, gt ; CHECK-NEXT: cmp w1, w9 -; CHECK-NEXT: mov v1.s[1], w13 ; CHECK-NEXT: csel w1, w1, w9, lt -; CHECK-NEXT: fmov s7, w0 -; CHECK-NEXT: fmov s0, w18 ; CHECK-NEXT: cmn w1, #8, lsl #12 // =32768 +; CHECK-NEXT: fcvtzs w3, d0 +; CHECK-NEXT: mov d0, v4.d[1] ; CHECK-NEXT: csel w1, w1, w8, gt +; CHECK-NEXT: cmp w0, w9 +; CHECK-NEXT: csel w0, w0, w9, lt +; CHECK-NEXT: fmov s7, w1 +; CHECK-NEXT: cmn w0, #8, lsl #12 // =32768 +; CHECK-NEXT: csel w0, w0, w8, gt ; CHECK-NEXT: cmp w2, w9 +; CHECK-NEXT: fcvtzs w5, d0 ; CHECK-NEXT: csel w2, w2, w9, lt -; CHECK-NEXT: mov v7.s[1], w17 -; CHECK-NEXT: mov v0.s[1], w15 +; CHECK-NEXT: fmov s3, w12 +; CHECK-NEXT: mov v7.s[1], w18 ; CHECK-NEXT: cmn w2, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w2, w2, w8, gt ; CHECK-NEXT: cmp w3, w9 ; CHECK-NEXT: csel w3, w3, w9, lt +; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: fmov s6, w2 ; CHECK-NEXT: cmn w3, #8, lsl #12 // =32768 +; CHECK-NEXT: fmov s2, w14 ; CHECK-NEXT: csel w3, w3, w8, gt ; CHECK-NEXT: cmp w4, w9 ; CHECK-NEXT: csel w4, w4, w9, lt -; CHECK-NEXT: mov v6.s[1], w1 +; CHECK-NEXT: mov v6.s[1], w0 ; CHECK-NEXT: cmn w4, #8, lsl #12 // =32768 +; CHECK-NEXT: mov v2.s[1], w11 ; CHECK-NEXT: csel w12, w4, w8, gt ; CHECK-NEXT: cmp w5, w9 +; CHECK-NEXT: fmov s1, w16 ; CHECK-NEXT: csel w10, w5, w9, lt ; CHECK-NEXT: fmov s5, w12 ; CHECK-NEXT: cmn w10, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w10, w10, w8, gt ; CHECK-NEXT: cmp w6, w9 +; CHECK-NEXT: mov v1.s[1], w13 ; CHECK-NEXT: csel w9, w6, w9, lt ; CHECK-NEXT: mov v5.s[1], w3 +; CHECK-NEXT: fmov s0, w17 ; CHECK-NEXT: cmn w9, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w8, w9, w8, gt ; CHECK-NEXT: fmov s4, w8 +; CHECK-NEXT: mov v0.s[1], w15 ; CHECK-NEXT: adrp x8, .LCPI85_0 ; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI85_0] ; CHECK-NEXT: mov v4.s[1], w10 diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index f23254cbf7b22..c94db3484994c 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -2196,13 +2196,13 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: fmov s9, w8 -; CHECK-NEXT: mov x22, #68719476735 // =0xfffffffff +; CHECK-NEXT: mov x23, #68719476735 // =0xfffffffff ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x10, x22, x8, gt +; CHECK-NEXT: csel x10, x23, x8, gt ; CHECK-NEXT: csinv x8, x9, xzr, le ; CHECK-NEXT: stp x8, x10, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 @@ -2213,9 +2213,9 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x9, x22, x9, gt -; CHECK-NEXT: csinv x24, x8, xzr, le -; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: csel x9, x23, x9, 
gt +; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: stp x8, x9, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload @@ -2226,7 +2226,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x25, x22, x9, gt +; CHECK-NEXT: csel x25, x23, x9, gt ; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti @@ -2237,9 +2237,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x26, x22, x9, gt -; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill +; CHECK-NEXT: csel x26, x23, x9, gt +; CHECK-NEXT: csinv x28, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -2249,8 +2248,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x29, x22, x9, gt -; CHECK-NEXT: csinv x27, x8, xzr, le +; CHECK-NEXT: csel x29, x23, x9, gt +; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: fcmp s8, #0.0 @@ -2259,8 +2258,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x20, x22, x9, gt -; CHECK-NEXT: csinv x21, x8, xzr, le +; CHECK-NEXT: csel x21, x23, x9, gt +; CHECK-NEXT: csinv x27, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -2270,36 +2269,35 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) { ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x28, x22, x9, gt -; CHECK-NEXT: csinv x23, x8, xzr, le +; CHECK-NEXT: csel x22, x23, x9, gt +; CHECK-NEXT: csinv x24, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: ldr x9, [sp] // 8-byte Folded Reload -; CHECK-NEXT: extr x8, x20, x21, #28 +; CHECK-NEXT: extr x8, x21, x27, #28 +; CHECK-NEXT: extr x9, x29, x20, #28 +; CHECK-NEXT: stur x28, [x19, #75] ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: bfi x28, x27, #36, #28 +; CHECK-NEXT: bfi x22, x20, #36, #28 ; CHECK-NEXT: lsr x11, x29, #28 -; CHECK-NEXT: bfi x26, x24, #36, #28 -; CHECK-NEXT: stur x9, [x19, #75] -; CHECK-NEXT: extr x9, x29, x27, #28 ; CHECK-NEXT: stur x8, [x19, #41] -; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: str x9, [x19, #16] +; CHECK-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: stp x23, x28, [x19] -; CHECK-NEXT: strb w11, [x19, #24] +; CHECK-NEXT: stp x24, x22, [x19] ; CHECK-NEXT: stur x10, [x19, #50] -; CHECK-NEXT: lsr x10, x20, #28 -; CHECK-NEXT: csel x9, x22, x9, gt -; CHECK-NEXT: bfi x9, x21, #36, #28 -; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: lsr x10, x21, #28 +; CHECK-NEXT: strb w11, [x19, #24] ; CHECK-NEXT: strb w10, [x19, #49] -; CHECK-NEXT: ldr x11, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, 
le +; CHECK-NEXT: ldp x12, x11, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bfi x9, x27, #36, #28 ; CHECK-NEXT: stur x8, [x19, #25] ; CHECK-NEXT: stur x9, [x19, #33] -; CHECK-NEXT: extr x10, x11, x24, #28 +; CHECK-NEXT: extr x10, x11, x12, #28 +; CHECK-NEXT: bfi x26, x12, #36, #28 ; CHECK-NEXT: stur x10, [x19, #91] ; CHECK-NEXT: ldp x10, x9, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: stur x26, [x19, #83] @@ -2849,56 +2847,56 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) { ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: fmov s3, w9 ; CHECK-NEXT: fcvtzu w9, d16 +; CHECK-NEXT: mov d16, v5.d[1] ; CHECK-NEXT: mov v0.b[5], w11 ; CHECK-NEXT: mov v3.s[1], w10 ; CHECK-NEXT: fcvtzu w10, d4 -; CHECK-NEXT: mov d4, v5.d[1] ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: mov w11, v3.s[1] ; CHECK-NEXT: mov v0.b[6], v3.b[0] ; CHECK-NEXT: csel w10, w10, w8, lo -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcvtzu w10, d4 -; CHECK-NEXT: mov d4, v6.d[1] +; CHECK-NEXT: fmov s4, w10 +; CHECK-NEXT: fcvtzu w10, d16 ; CHECK-NEXT: mov v0.b[7], w11 -; CHECK-NEXT: mov v16.s[1], w9 +; CHECK-NEXT: mov v4.s[1], w9 ; CHECK-NEXT: fcvtzu w9, d5 +; CHECK-NEXT: mov d5, v6.d[1] ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w9, #255 -; CHECK-NEXT: mov w11, v16.s[1] -; CHECK-NEXT: mov v0.b[8], v16.b[0] +; CHECK-NEXT: mov w11, v4.s[1] +; CHECK-NEXT: mov v0.b[8], v4.b[0] ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: fcvtzu w9, d4 -; CHECK-NEXT: mov d4, v7.d[1] +; CHECK-NEXT: fmov s16, w9 +; CHECK-NEXT: fcvtzu w9, d5 +; CHECK-NEXT: mov d5, v7.d[1] ; CHECK-NEXT: mov v0.b[9], w11 -; CHECK-NEXT: mov v5.s[1], w10 +; CHECK-NEXT: mov v16.s[1], w10 ; CHECK-NEXT: fcvtzu w10, d6 ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: mov v0.b[10], v5.b[0] -; CHECK-NEXT: mov w11, v5.s[1] +; CHECK-NEXT: mov v0.b[10], v16.b[0] +; CHECK-NEXT: mov w11, v16.s[1] ; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: fmov s6, w10 ; CHECK-NEXT: fcvtzu w10, d7 ; CHECK-NEXT: mov v0.b[11], w11 ; CHECK-NEXT: mov v6.s[1], w9 -; CHECK-NEXT: fcvtzu w9, d4 +; CHECK-NEXT: fcvtzu w9, d5 ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: mov v0.b[12], v6.b[0] ; CHECK-NEXT: mov w11, v6.s[1] ; CHECK-NEXT: csel w9, w9, w8, lo ; CHECK-NEXT: cmp w10, #255 ; CHECK-NEXT: csel w8, w10, w8, lo -; CHECK-NEXT: fmov s4, w8 +; CHECK-NEXT: fmov s5, w8 ; CHECK-NEXT: mov v0.b[13], w11 -; CHECK-NEXT: mov v4.s[1], w9 -; CHECK-NEXT: mov v0.b[14], v4.b[0] -; CHECK-NEXT: mov w8, v4.s[1] +; CHECK-NEXT: mov v5.s[1], w9 +; CHECK-NEXT: mov v0.b[14], v5.b[0] +; CHECK-NEXT: mov w8, v5.s[1] ; CHECK-NEXT: mov v0.b[15], w8 ; CHECK-NEXT: ret %x = call <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f) @@ -2961,76 +2959,76 @@ define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) { ; CHECK-NEXT: mov w8, #65535 // =0xffff ; CHECK-NEXT: fcvtzu w9, d3 ; CHECK-NEXT: mov d3, v1.d[1] -; CHECK-NEXT: fcvtzu w11, d1 +; CHECK-NEXT: fcvtzu w10, d1 ; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: fcvtzu w10, d2 -; CHECK-NEXT: fcvtzu w13, d0 +; CHECK-NEXT: fcvtzu w11, d2 +; CHECK-NEXT: fcvtzu w12, d0 ; CHECK-NEXT: mov d0, v7.d[1] ; CHECK-NEXT: mov d2, v6.d[1] -; CHECK-NEXT: fcvtzu w15, d7 -; CHECK-NEXT: fcvtzu w12, d16 -; CHECK-NEXT: fcvtzu w14, d17 -; CHECK-NEXT: fcvtzu w16, d6 +; CHECK-NEXT: fcvtzu w14, d7 +; CHECK-NEXT: fcvtzu w13, d16 +; CHECK-NEXT: fcvtzu w16, d17 +; CHECK-NEXT: fcvtzu w15, d6 ; CHECK-NEXT: 
fcvtzu w17, d3 ; CHECK-NEXT: mov d6, v5.d[1] ; CHECK-NEXT: mov d3, v4.d[1] ; CHECK-NEXT: fcvtzu w18, d1 -; CHECK-NEXT: cmp w12, w8 -; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: cmp w13, w8 +; CHECK-NEXT: csel w13, w13, w8, lo ; CHECK-NEXT: cmp w9, w8 ; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: cmp w16, w8 ; CHECK-NEXT: fmov s19, w9 -; CHECK-NEXT: csel w9, w14, w8, lo -; CHECK-NEXT: cmp w10, w8 -; CHECK-NEXT: fcvtzu w14, d0 -; CHECK-NEXT: csel w10, w10, w8, lo -; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: mov v19.s[1], w12 -; CHECK-NEXT: csel w12, w17, w8, lo +; CHECK-NEXT: csel w9, w16, w8, lo ; CHECK-NEXT: cmp w11, w8 +; CHECK-NEXT: fcvtzu w16, d0 ; CHECK-NEXT: csel w11, w11, w8, lo +; CHECK-NEXT: cmp w17, w8 +; CHECK-NEXT: mov v19.s[1], w13 +; CHECK-NEXT: csel w13, w17, w8, lo +; CHECK-NEXT: cmp w10, w8 +; CHECK-NEXT: csel w10, w10, w8, lo ; CHECK-NEXT: cmp w18, w8 -; CHECK-NEXT: fmov s18, w10 -; CHECK-NEXT: csel w10, w18, w8, lo -; CHECK-NEXT: cmp w13, w8 +; CHECK-NEXT: fmov s18, w11 +; CHECK-NEXT: csel w11, w18, w8, lo +; CHECK-NEXT: cmp w12, w8 ; CHECK-NEXT: fcvtzu w17, d2 -; CHECK-NEXT: csel w13, w13, w8, lo -; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: cmp w16, w8 ; CHECK-NEXT: fcvtzu w18, d6 ; CHECK-NEXT: mov v18.s[1], w9 -; CHECK-NEXT: csel w9, w14, w8, lo -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: fmov s17, w11 -; CHECK-NEXT: csel w11, w15, w8, lo -; CHECK-NEXT: fcvtzu w14, d5 -; CHECK-NEXT: fmov s23, w11 +; CHECK-NEXT: csel w9, w16, w8, lo +; CHECK-NEXT: cmp w14, w8 +; CHECK-NEXT: fmov s17, w10 +; CHECK-NEXT: csel w10, w14, w8, lo +; CHECK-NEXT: fcvtzu w16, d5 +; CHECK-NEXT: fmov s23, w10 ; CHECK-NEXT: cmp w17, w8 -; CHECK-NEXT: fcvtzu w15, d3 -; CHECK-NEXT: csel w11, w17, w8, lo -; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: fcvtzu w14, d3 +; CHECK-NEXT: csel w10, w17, w8, lo +; CHECK-NEXT: cmp w15, w8 ; CHECK-NEXT: fcvtzu w17, d4 -; CHECK-NEXT: mov v17.s[1], w12 +; CHECK-NEXT: mov v17.s[1], w13 ; CHECK-NEXT: mov v23.s[1], w9 -; CHECK-NEXT: csel w9, w16, w8, lo +; CHECK-NEXT: csel w9, w15, w8, lo ; CHECK-NEXT: cmp w18, w8 ; CHECK-NEXT: fmov s22, w9 ; CHECK-NEXT: csel w9, w18, w8, lo +; CHECK-NEXT: cmp w16, w8 +; CHECK-NEXT: fmov s16, w12 +; CHECK-NEXT: mov v22.s[1], w10 +; CHECK-NEXT: csel w10, w16, w8, lo ; CHECK-NEXT: cmp w14, w8 -; CHECK-NEXT: fmov s16, w13 -; CHECK-NEXT: mov v22.s[1], w11 -; CHECK-NEXT: csel w11, w14, w8, lo -; CHECK-NEXT: cmp w15, w8 -; CHECK-NEXT: fmov s21, w11 -; CHECK-NEXT: csel w11, w15, w8, lo +; CHECK-NEXT: fmov s21, w10 +; CHECK-NEXT: csel w10, w14, w8, lo ; CHECK-NEXT: cmp w17, w8 ; CHECK-NEXT: csel w8, w17, w8, lo -; CHECK-NEXT: mov v16.s[1], w10 +; CHECK-NEXT: mov v16.s[1], w11 ; CHECK-NEXT: mov v21.s[1], w9 ; CHECK-NEXT: fmov s20, w8 ; CHECK-NEXT: adrp x8, .LCPI85_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI85_0] -; CHECK-NEXT: mov v20.s[1], w11 +; CHECK-NEXT: mov v20.s[1], w10 ; CHECK-NEXT: tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b ; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll index 90e93577efd9f..16a6ba3f8cc93 100644 --- a/llvm/test/CodeGen/AArch64/frem.ll +++ b/llvm/test/CodeGen/AArch64/frem.ll @@ -610,7 +610,7 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov s15, v1.s[2] ; CHECK-GI-NEXT: mov s13, v1.s[3] ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-GI-NEXT: str s2, [sp, #64] // 4-byte 
Folded Spill +; CHECK-GI-NEXT: str s2, [sp, #48] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v4.s[2] ; CHECK-GI-NEXT: str s2, [sp, #112] // 4-byte Folded Spill ; CHECK-GI-NEXT: mov s2, v3.s[3] @@ -628,17 +628,17 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: fmov s0, s10 ; CHECK-GI-NEXT: fmov s1, s13 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: fmov s1, s12 ; CHECK-GI-NEXT: str d0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr s0, [sp, #64] // 4-byte Folded Reload +; CHECK-GI-NEXT: ldr s0, [sp, #48] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: fmov s1, s11 -; CHECK-GI-NEXT: str d0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str d0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr s0, [sp, #112] // 4-byte Folded Reload ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: str d0, [sp, #112] // 16-byte Folded Spill @@ -651,7 +651,7 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: ldr x30, [sp, #192] // 8-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #160] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[1], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload @@ -659,7 +659,7 @@ define <8 x float> @frem_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov v1.s[2], v2.s[0] ; CHECK-GI-NEXT: ldr q2, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.s[2], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v3.s[3], v0.s[0] ; CHECK-GI-NEXT: mov v2.16b, v1.16b @@ -777,15 +777,15 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h8, v0.h[1] -; CHECK-GI-NEXT: mov h9, v0.h[2] -; CHECK-GI-NEXT: mov h10, v0.h[3] -; CHECK-GI-NEXT: mov h11, v0.h[4] +; CHECK-GI-NEXT: mov h9, v0.h[1] +; CHECK-GI-NEXT: mov h10, v0.h[2] +; CHECK-GI-NEXT: mov h11, v0.h[3] +; CHECK-GI-NEXT: mov h12, v0.h[4] ; CHECK-GI-NEXT: mov h14, v1.h[1] ; CHECK-GI-NEXT: mov h15, v1.h[2] -; CHECK-GI-NEXT: mov h13, v1.h[3] -; CHECK-GI-NEXT: mov h12, v1.h[4] -; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h8, v1.h[3] +; CHECK-GI-NEXT: mov h13, v1.h[4] +; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill @@ -795,34 +795,34 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h8 +; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h14 -; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte 
Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h9 +; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h13 +; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h12 +; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #172] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -835,19 +835,20 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload -; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] @@ -1081,17 +1082,17 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 ; CHECK-GI-NEXT: mov h2, v0.h[5] -; CHECK-GI-NEXT: mov h10, v0.h[1] -; CHECK-GI-NEXT: mov h11, v0.h[2] -; CHECK-GI-NEXT: mov h12, v0.h[3] -; CHECK-GI-NEXT: mov h13, v0.h[4] +; CHECK-GI-NEXT: mov h11, v0.h[1] +; CHECK-GI-NEXT: mov h12, v0.h[2] +; CHECK-GI-NEXT: mov h13, v0.h[3] +; CHECK-GI-NEXT: mov h14, v0.h[4] ; CHECK-GI-NEXT: mov h8, v1.h[1] ; CHECK-GI-NEXT: mov h9, v1.h[2] -; CHECK-GI-NEXT: mov h15, v1.h[3] -; CHECK-GI-NEXT: mov h14, v1.h[4] +; CHECK-GI-NEXT: mov h10, v1.h[3] +; CHECK-GI-NEXT: mov h15, v1.h[4] ; CHECK-GI-NEXT: str h2, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[6] -; CHECK-GI-NEXT: str h2, [sp, #80] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h2, [sp, #64] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h2, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: str h2, [sp, #96] // 2-byte Folded Spill @@ -1103,27 +1104,27 @@ define 
<8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #190] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h10 +; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h15 +; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf @@ -1135,10 +1136,10 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr h0, [sp, #188] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 @@ -1151,7 +1152,7 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #32] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload @@ -1166,7 +1167,7 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #96] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v1.h[6], v2.h[0] @@ -1369,27 +1370,27 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: .cfi_offset b13, -64 ; CHECK-GI-NEXT: .cfi_offset b14, -72 ; CHECK-GI-NEXT: .cfi_offset b15, -80 -; CHECK-GI-NEXT: mov h4, v0.h[4] +; CHECK-GI-NEXT: mov v4.16b, v1.16b ; CHECK-GI-NEXT: str q1, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: mov h11, v0.h[1] -; CHECK-GI-NEXT: mov h12, v0.h[2] -; CHECK-GI-NEXT: mov h13, v0.h[3] +; CHECK-GI-NEXT: mov h1, v0.h[4] +; CHECK-GI-NEXT: mov h12, v0.h[1] +; CHECK-GI-NEXT: mov h13, v0.h[2] ; CHECK-GI-NEXT: str q3, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov h14, v0.h[3] ; CHECK-GI-NEXT: mov h15, v2.h[1] ; CHECK-GI-NEXT: mov h8, v2.h[2] ; CHECK-GI-NEXT: mov h9, v2.h[3] ; CHECK-GI-NEXT: mov h10, v2.h[4] -; CHECK-GI-NEXT: mov h14, v2.h[5] -; 
CHECK-GI-NEXT: str h4, [sp, #288] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[5] -; CHECK-GI-NEXT: str h4, [sp, #240] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[6] -; CHECK-GI-NEXT: str h4, [sp, #176] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov h4, v0.h[7] +; CHECK-GI-NEXT: mov h11, v2.h[5] +; CHECK-GI-NEXT: str h1, [sp, #272] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[5] +; CHECK-GI-NEXT: str h1, [sp, #240] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[6] +; CHECK-GI-NEXT: str h1, [sp, #176] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: str h4, [sp, #144] // 2-byte Folded Spill -; CHECK-GI-NEXT: mov v4.16b, v1.16b -; CHECK-GI-NEXT: mov h1, v1.h[1] +; CHECK-GI-NEXT: str h1, [sp, #144] // 2-byte Folded Spill +; CHECK-GI-NEXT: mov h1, v4.h[1] ; CHECK-GI-NEXT: str h1, [sp, #48] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[2] ; CHECK-GI-NEXT: str h1, [sp, #80] // 2-byte Folded Spill @@ -1400,7 +1401,7 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: mov h1, v4.h[5] ; CHECK-GI-NEXT: str h1, [sp, #256] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[6] -; CHECK-GI-NEXT: str h1, [sp, #320] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #336] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v4.h[7] ; CHECK-GI-NEXT: str h1, [sp, #352] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v2.h[6] @@ -1418,40 +1419,40 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: mov h1, v3.h[5] ; CHECK-GI-NEXT: str h1, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[6] -; CHECK-GI-NEXT: str h1, [sp, #222] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #238] // 2-byte Folded Spill ; CHECK-GI-NEXT: mov h1, v3.h[7] -; CHECK-GI-NEXT: str h1, [sp, #286] // 2-byte Folded Spill +; CHECK-GI-NEXT: str h1, [sp, #302] // 2-byte Folded Spill ; CHECK-GI-NEXT: fcvt s1, h2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 +; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h12 +; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 -; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h13 +; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 -; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #288] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 ; CHECK-GI-NEXT: fcvt s1, h10 -; CHECK-GI-NEXT: str q0, [sp, #288] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h14 +; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: str q0, [sp, #240] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf @@ -1519,11 +1520,11 @@ define <16 x half> 
@frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #320] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h1, [sp, #336] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #222] // 2-byte Folded Reload +; CHECK-GI-NEXT: str q0, [sp, #336] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr h0, [sp, #238] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf @@ -1531,47 +1532,46 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s2, h1 ; CHECK-GI-NEXT: str q0, [sp, #352] // 16-byte Folded Spill -; CHECK-GI-NEXT: ldr h0, [sp, #286] // 2-byte Folded Reload +; CHECK-GI-NEXT: ldr h0, [sp, #302] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt s1, h0 ; CHECK-GI-NEXT: fmov s0, s2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr q1, [sp, #304] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q2, [sp, #224] // 16-byte Folded Reload -; CHECK-GI-NEXT: ldr q3, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q3, [sp, #304] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q2, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp x29, x30, [sp, #432] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d9, d8, [sp, #416] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d11, d10, [sp, #400] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d13, d12, [sp, #384] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v3.h[2], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #288] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[2], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #272] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldp d15, d14, [sp, #368] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[3], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[3], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #240] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[4], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[4], v2.h[0] ; CHECK-GI-NEXT: ldr q2, [sp, #176] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [sp, #320] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[5], v2.h[0] +; CHECK-GI-NEXT: ldr q2, [sp, #336] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[5], v2.h[0] ; CHECK-GI-NEXT: fcvt h2, s0 ; CHECK-GI-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] -; CHECK-GI-NEXT: ldr q0, [sp, #352] // 16-byte Folded Reload ; CHECK-GI-NEXT: mov v3.h[6], v0.h[0] +; CHECK-GI-NEXT: ldr q0, [sp, #352] // 
16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.h[6], v0.h[0] ; CHECK-GI-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload -; CHECK-GI-NEXT: mov v1.h[7], v0.h[0] -; CHECK-GI-NEXT: mov v3.h[7], v2.h[0] -; CHECK-GI-NEXT: mov v0.16b, v1.16b -; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: mov v3.h[7], v0.h[0] +; CHECK-GI-NEXT: mov v1.h[7], v2.h[0] +; CHECK-GI-NEXT: mov v0.16b, v3.16b ; CHECK-GI-NEXT: add sp, sp, #448 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll index e2cd3835e4499..ed0d0d5194835 100644 --- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll +++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll @@ -272,22 +272,21 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h9 ; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: fcvt s1, h10 ; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill ; GISEL-NEXT: fmov s0, s1 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload -; GISEL-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; GISEL-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; GISEL-NEXT: fcvt h0, s0 ; GISEL-NEXT: ldp d9, d8, [sp, #56] // 16-byte Folded Reload ; GISEL-NEXT: ldr x30, [sp, #72] // 8-byte Folded Reload -; GISEL-NEXT: mov v1.h[1], v2.h[0] -; GISEL-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload ; GISEL-NEXT: ldr d10, [sp, #48] // 8-byte Folded Reload +; GISEL-NEXT: mov v1.h[1], v2.h[0] +; GISEL-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; GISEL-NEXT: mov v1.h[2], v2.h[0] ; GISEL-NEXT: mov v1.h[3], v0.h[0] ; GISEL-NEXT: mov v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll index 1b037c13aa4b5..0241091fae025 100644 --- a/llvm/test/CodeGen/AArch64/neon-addlv.ll +++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll @@ -195,7 +195,6 @@ entry: } declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) - declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64) define <8 x i8> @uaddlv_v8i8_urshr(<8 x i8> %a) { @@ -215,3 +214,36 @@ entry: %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> poison, <8 x i32> zeroinitializer ret <8 x i8> %vecinit7.i } + +define <4 x i32> @uaddlv_dup_v4i16(<4 x i16> %a) { +; CHECK-LABEL: uaddlv_dup_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uaddlv s0, v0.4h +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ushr v0.4s, v0.4s, #3 +; CHECK-NEXT: ret +entry: + %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a) + %vecinit.i = insertelement <4 x i32> undef, i32 %vaddlv.i, i64 0 + %vecinit7.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> poison, <4 x i32> zeroinitializer + %vshr_n = lshr <4 x i32> %vecinit7.i, <i32 3, i32 3, i32 3, i32 3> + ret <4 x i32> %vshr_n +} + +define <4 x i32> @uaddlv_dup_v8i16(<8 x i16> %a) { +; CHECK-LABEL: uaddlv_dup_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uaddlv s0, v0.8h +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ushr v0.4s, v0.4s, #3 +; CHECK-NEXT: ret +entry: + %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a) + %vecinit.i = insertelement <4 x i32> undef, i32 %vaddlv.i, i64 0 + %vecinit7.i = shufflevector <4 x i32> %vecinit.i, <4 x i32> poison, <4 x i32> zeroinitializer + %vshr_n = lshr <4 x i32> %vecinit7.i, <i32 3, i32 3, i32 3, i32 3> + ret <4 x i32> %vshr_n +} + +declare i32 
@llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>) diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 40a8128857cb7..74048b8bee332 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -671,152 +671,152 @@ define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: ldr b3, [sp, #80] ; CHECK-NEXT: mov v0.b[1], w1 ; CHECK-NEXT: ldr b4, [sp, #528] -; CHECK-NEXT: ldr b6, [sp, #656] ; CHECK-NEXT: add x10, sp, #88 ; CHECK-NEXT: ld1 { v2.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #536 -; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: ldr b5, [sp, #336] -; CHECK-NEXT: ldr b7, [sp, #464] -; CHECK-NEXT: add x12, sp, #664 +; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: ld1 { v3.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #344 -; CHECK-NEXT: mov v0.b[2], w2 ; CHECK-NEXT: ld1 { v4.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #176 +; CHECK-NEXT: ldr b6, [sp, #656] +; CHECK-NEXT: mov v0.b[2], w2 +; CHECK-NEXT: ld1 { v5.b }[1], [x10] +; CHECK-NEXT: ldr b7, [sp, #464] ; CHECK-NEXT: ld1 { v1.b }[3], [x8] +; CHECK-NEXT: add x12, sp, #664 ; CHECK-NEXT: add x9, sp, #472 ; CHECK-NEXT: ld1 { v6.b }[1], [x12] -; CHECK-NEXT: ld1 { v5.b }[1], [x10] +; CHECK-NEXT: add x8, sp, #96 +; CHECK-NEXT: add x10, sp, #184 ; CHECK-NEXT: add x12, sp, #288 ; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: ld1 { v2.b }[2], [x12] -; CHECK-NEXT: add x8, sp, #96 -; CHECK-NEXT: add x13, sp, #544 +; CHECK-NEXT: ld1 { v3.b }[2], [x8] ; CHECK-NEXT: mov v0.b[3], w3 ; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #672 -; CHECK-NEXT: add x10, sp, #184 -; CHECK-NEXT: ld1 { v3.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #352 -; CHECK-NEXT: ld1 { v4.b }[2], [x13] -; CHECK-NEXT: ld1 { v6.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #480 +; CHECK-NEXT: ld1 { v2.b }[2], [x12] +; CHECK-NEXT: add x13, sp, #544 ; CHECK-NEXT: ld1 { v5.b }[2], [x8] -; CHECK-NEXT: ld1 { v7.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #296 -; CHECK-NEXT: mov v0.b[4], w4 -; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: ld1 { v2.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #552 +; CHECK-NEXT: add x8, sp, #672 +; CHECK-NEXT: ld1 { v4.b }[2], [x13] ; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: add x15, sp, #104 -; CHECK-NEXT: ld1 { v4.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #360 +; CHECK-NEXT: ld1 { v1.b }[5], [x10] +; CHECK-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #480 +; CHECK-NEXT: mov v0.b[4], w4 +; CHECK-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #296 +; CHECK-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #552 ; CHECK-NEXT: add x12, sp, #200 -; CHECK-NEXT: ld1 { v5.b }[3], [x11] ; CHECK-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #360 +; CHECK-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: add x9, sp, #560 ; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: ld1 { v3.b }[3], [x15] -; CHECK-NEXT: add x15, sp, #368 +; CHECK-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #368 +; CHECK-NEXT: ld1 { v1.b }[7], [x12] ; CHECK-NEXT: ld1 { v4.b }[4], [x9] ; CHECK-NEXT: add x13, sp, #208 -; CHECK-NEXT: add x8, sp, #216 -; CHECK-NEXT: ld1 { v5.b }[4], [x15] -; CHECK-NEXT: ld1 { v1.b }[7], [x12] -; CHECK-NEXT: add x12, sp, #568 -; CHECK-NEXT: add x14, sp, #224 -; CHECK-NEXT: add x16, sp, #304 -; CHECK-NEXT: add x10, sp, #232 -; CHECK-NEXT: mov v0.b[6], w6 -; 
CHECK-NEXT: ld1 { v4.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #376 -; CHECK-NEXT: ld1 { v5.b }[5], [x12] +; CHECK-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-NEXT: add x12, sp, #304 +; CHECK-NEXT: add x8, sp, #568 +; CHECK-NEXT: ld1 { v2.b }[4], [x12] ; CHECK-NEXT: add x12, sp, #16 +; CHECK-NEXT: add x17, sp, #376 +; CHECK-NEXT: mov v0.b[6], w6 ; CHECK-NEXT: ld1 { v1.b }[8], [x13] +; CHECK-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-NEXT: add x14, sp, #216 +; CHECK-NEXT: ld1 { v5.b }[5], [x17] ; CHECK-NEXT: add x13, sp, #576 -; CHECK-NEXT: ld1 { v2.b }[4], [x16] -; CHECK-NEXT: add x11, sp, #240 +; CHECK-NEXT: add x11, sp, #224 +; CHECK-NEXT: add x10, sp, #232 +; CHECK-NEXT: add x15, sp, #240 +; CHECK-NEXT: ld1 { v1.b }[9], [x14] ; CHECK-NEXT: ld1 { v4.b }[6], [x13] ; CHECK-NEXT: add x13, sp, #384 -; CHECK-NEXT: add x9, sp, #248 ; CHECK-NEXT: mov v0.b[7], w7 -; CHECK-NEXT: ld1 { v1.b }[9], [x8] ; CHECK-NEXT: ld1 { v5.b }[6], [x13] ; CHECK-NEXT: add x13, sp, #112 -; CHECK-NEXT: add x8, sp, #584 -; CHECK-NEXT: add x15, sp, #256 ; CHECK-NEXT: ld1 { v3.b }[4], [x13] ; CHECK-NEXT: add x13, sp, #32 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] -; CHECK-NEXT: ld1 { v1.b }[10], [x14] -; CHECK-NEXT: add x14, sp, #312 -; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: add x14, sp, #584 +; CHECK-NEXT: ld1 { v1.b }[10], [x11] +; CHECK-NEXT: ld1 { v4.b }[7], [x14] +; CHECK-NEXT: add x11, sp, #312 +; CHECK-NEXT: add x14, sp, #40 +; CHECK-NEXT: ld1 { v2.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #592 ; CHECK-NEXT: ld1 { v0.b }[8], [x12] ; CHECK-NEXT: add x12, sp, #24 -; CHECK-NEXT: ld1 { v2.b }[5], [x14] -; CHECK-NEXT: add x14, sp, #592 -; CHECK-NEXT: add x16, sp, #264 -; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: add x16, sp, #248 ; CHECK-NEXT: ld1 { v1.b }[11], [x10] -; CHECK-NEXT: ld1 { v4.b }[8], [x14] -; CHECK-NEXT: add x14, sp, #400 +; CHECK-NEXT: ld1 { v4.b }[8], [x11] +; CHECK-NEXT: add x11, sp, #400 +; CHECK-NEXT: add x9, sp, #256 +; CHECK-NEXT: add x8, sp, #264 +; CHECK-NEXT: add x10, sp, #72 ; CHECK-NEXT: ld1 { v0.b }[9], [x12] ; CHECK-NEXT: add x12, sp, #392 -; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: movi v16.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v5.b }[7], [x12] ; CHECK-NEXT: add x12, sp, #48 +; CHECK-NEXT: ld1 { v1.b }[12], [x15] +; CHECK-NEXT: add x15, sp, #120 ; CHECK-NEXT: movi v17.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v1.b }[12], [x11] -; CHECK-NEXT: add x11, sp, #120 ; CHECK-NEXT: movi v18.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v0.b }[10], [x13] -; CHECK-NEXT: ld1 { v3.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #408 -; CHECK-NEXT: ld1 { v5.b }[8], [x14] +; CHECK-NEXT: ld1 { v3.b }[5], [x15] +; CHECK-NEXT: add x15, sp, #408 +; CHECK-NEXT: ld1 { v5.b }[8], [x11] ; CHECK-NEXT: add x13, sp, #56 -; CHECK-NEXT: add x14, sp, #64 -; CHECK-NEXT: ld1 { v1.b }[13], [x9] -; CHECK-NEXT: add x9, sp, #616 +; CHECK-NEXT: ld1 { v1.b }[13], [x16] +; CHECK-NEXT: add x11, sp, #64 +; CHECK-NEXT: add x16, sp, #616 ; CHECK-NEXT: movi v19.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v0.b }[11], [x8] -; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: ld1 { v4.b }[9], [x8] -; CHECK-NEXT: ld1 { v5.b }[9], [x11] -; CHECK-NEXT: add x11, sp, #608 -; CHECK-NEXT: ld1 { v1.b }[14], [x15] -; CHECK-NEXT: add x15, sp, #488 -; CHECK-NEXT: add x8, sp, #320 +; CHECK-NEXT: ld1 { v0.b }[11], [x14] +; CHECK-NEXT: add x14, sp, #600 +; CHECK-NEXT: ld1 { v4.b }[9], [x14] +; CHECK-NEXT: ld1 { v5.b }[9], [x15] +; CHECK-NEXT: add x15, sp, #608 +; CHECK-NEXT: ld1 { v1.b }[14], [x9] +; CHECK-NEXT: add x9, sp, #488 +; CHECK-NEXT: add x14, 
sp, #320 ; CHECK-NEXT: ld1 { v0.b }[12], [x12] -; CHECK-NEXT: ld1 { v7.b }[3], [x15] -; CHECK-NEXT: ld1 { v2.b }[6], [x8] -; CHECK-NEXT: ld1 { v4.b }[10], [x11] -; CHECK-NEXT: add x8, sp, #624 +; CHECK-NEXT: ld1 { v7.b }[3], [x9] +; CHECK-NEXT: ld1 { v2.b }[6], [x14] +; CHECK-NEXT: ld1 { v4.b }[10], [x15] +; CHECK-NEXT: add x14, sp, #624 +; CHECK-NEXT: add x9, sp, #688 +; CHECK-NEXT: ld1 { v1.b }[15], [x8] +; CHECK-NEXT: add x8, sp, #432 ; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: add x11, sp, #128 -; CHECK-NEXT: ld1 { v1.b }[15], [x16] ; CHECK-NEXT: ld1 { v0.b }[13], [x13] ; CHECK-NEXT: add x13, sp, #416 -; CHECK-NEXT: ld1 { v3.b }[6], [x11] +; CHECK-NEXT: ld1 { v2.b }[7], [x12] ; CHECK-NEXT: ld1 { v5.b }[10], [x13] -; CHECK-NEXT: ld1 { v4.b }[11], [x9] -; CHECK-NEXT: add x9, sp, #680 -; CHECK-NEXT: ld1 { v6.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #688 +; CHECK-NEXT: ld1 { v4.b }[11], [x16] +; CHECK-NEXT: add x16, sp, #680 +; CHECK-NEXT: ld1 { v6.b }[3], [x16] ; CHECK-NEXT: add x13, sp, #632 -; CHECK-NEXT: ld1 { v0.b }[14], [x14] -; CHECK-NEXT: add x14, sp, #424 -; CHECK-NEXT: ld1 { v2.b }[7], [x12] -; CHECK-NEXT: ld1 { v5.b }[11], [x14] -; CHECK-NEXT: ld1 { v4.b }[12], [x8] -; CHECK-NEXT: add x8, sp, #432 -; CHECK-NEXT: ld1 { v6.b }[4], [x9] -; CHECK-NEXT: add x11, sp, #696 ; CHECK-NEXT: add x12, sp, #504 +; CHECK-NEXT: ld1 { v0.b }[14], [x11] +; CHECK-NEXT: add x11, sp, #424 +; CHECK-NEXT: add x15, sp, #128 +; CHECK-NEXT: ld1 { v5.b }[11], [x11] +; CHECK-NEXT: ld1 { v4.b }[12], [x14] +; CHECK-NEXT: add x11, sp, #696 +; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: ld1 { v3.b }[6], [x15] +; CHECK-NEXT: add x9, sp, #640 ; CHECK-NEXT: ld1 { v0.b }[15], [x10] ; CHECK-NEXT: add x10, sp, #496 -; CHECK-NEXT: add x9, sp, #640 ; CHECK-NEXT: ld1 { v5.b }[12], [x8] ; CHECK-NEXT: ld1 { v7.b }[4], [x10] ; CHECK-NEXT: ld1 { v4.b }[13], [x13] @@ -1105,220 +1105,220 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: fmov s3, w0 +; CHECK-NEXT: fmov s4, w0 ; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b1, [sp, #16] ; CHECK-NEXT: add x10, sp, #24 ; CHECK-NEXT: ldr b2, [sp, #280] ; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: ldr b5, [sp, #152] -; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: mov v3.b[1], w1 +; CHECK-NEXT: ldr b3, [sp, #216] +; CHECK-NEXT: add x11, sp, #224 +; CHECK-NEXT: mov v4.b[1], w1 ; CHECK-NEXT: ld1 { v1.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #288 +; CHECK-NEXT: ldr b5, [sp, #152] +; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: ld1 { v3.b }[1], [x11] ; CHECK-NEXT: add x10, sp, #160 -; CHECK-NEXT: ldr b4, [sp, #216] ; CHECK-NEXT: ld1 { v0.b }[2], [x9] ; CHECK-NEXT: ld1 { v5.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: add x11, sp, #224 -; CHECK-NEXT: ld1 { v1.b }[2], [x10] -; CHECK-NEXT: add x8, sp, #104 -; CHECK-NEXT: mov v3.b[2], w2 -; CHECK-NEXT: ld1 { v4.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #296 -; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: mov v4.b[2], w2 +; CHECK-NEXT: ld1 { v1.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #232 -; CHECK-NEXT: ld1 { v1.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: ld1 { v2.b }[2], [x11] +; CHECK-NEXT: ld1 { v3.b }[2], [x10] ; CHECK-NEXT: add x11, sp, #168 -; CHECK-NEXT: ld1 { v4.b }[2], [x10] +; CHECK-NEXT: ld1 { v0.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #40 ; CHECK-NEXT: ld1 { v5.b }[2], [x11] -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: mov v3.b[3], w3 +; CHECK-NEXT: ld1 { v1.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #240 -; CHECK-NEXT: add x15, sp, #56 -; CHECK-NEXT: ld1 { v1.b }[4], [x13] -; CHECK-NEXT: add x12, sp, #112 -; CHECK-NEXT: add x11, sp, #304 -; CHECK-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-NEXT: mov v4.b[3], w3 +; CHECK-NEXT: ld1 { v3.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: ld1 { v0.b }[4], [x12] +; CHECK-NEXT: add x12, sp, #112 +; CHECK-NEXT: add x13, sp, #48 +; CHECK-NEXT: add x9, sp, #120 ; CHECK-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-NEXT: ld1 { v0.b }[4], [x12] ; CHECK-NEXT: add x12, sp, #184 -; CHECK-NEXT: ld1 { v2.b }[3], [x11] -; CHECK-NEXT: mov v3.b[4], w4 -; CHECK-NEXT: ld1 { v1.b }[5], [x15] -; CHECK-NEXT: add x11, sp, #64 -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ldr b6, [sp, #352] +; CHECK-NEXT: ld1 { v1.b }[4], [x13] +; CHECK-NEXT: add x15, sp, #56 +; CHECK-NEXT: add x14, sp, #128 +; CHECK-NEXT: mov v4.b[4], w4 +; CHECK-NEXT: add x11, sp, #304 +; CHECK-NEXT: add x13, sp, #256 ; CHECK-NEXT: ld1 { v5.b }[4], [x12] ; CHECK-NEXT: ld1 { v0.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: mov v3.b[5], w5 -; CHECK-NEXT: add x11, sp, #360 -; CHECK-NEXT: ldr b16, [sp, #552] -; CHECK-NEXT: ld1 { v5.b }[5], [x9] -; CHECK-NEXT: ld1 { v6.b }[1], [x11] +; CHECK-NEXT: add x12, sp, #248 +; CHECK-NEXT: ld1 { v1.b }[5], [x15] ; CHECK-NEXT: add x15, sp, #200 -; CHECK-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #560 -; CHECK-NEXT: add x14, sp, #128 -; CHECK-NEXT: ld1 { v16.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #368 +; CHECK-NEXT: ld1 { v3.b }[4], [x12] +; CHECK-NEXT: ld1 { v2.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #64 +; CHECK-NEXT: mov v4.b[5], w5 +; CHECK-NEXT: ld1 { v5.b }[5], [x9] ; CHECK-NEXT: ld1 { v0.b }[6], 
[x14] -; CHECK-NEXT: mov v3.b[6], w6 +; CHECK-NEXT: ldr b6, [sp, #352] +; CHECK-NEXT: add x10, sp, #136 +; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #360 +; CHECK-NEXT: ld1 { v3.b }[5], [x13] +; CHECK-NEXT: ldr b18, [sp, #552] ; CHECK-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-NEXT: ld1 { v6.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #568 ; CHECK-NEXT: add x14, sp, #208 -; CHECK-NEXT: ldr b18, [sp, #480] -; CHECK-NEXT: ld1 { v16.b }[2], [x8] -; CHECK-NEXT: ldr b7, [sp, #144] -; CHECK-NEXT: add x11, sp, #488 +; CHECK-NEXT: ld1 { v6.b }[1], [x11] +; CHECK-NEXT: mov v4.b[6], w6 +; CHECK-NEXT: ld1 { v0.b }[7], [x10] +; CHECK-NEXT: add x10, sp, #560 +; CHECK-NEXT: add x9, sp, #264 +; CHECK-NEXT: ld1 { v18.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #568 ; CHECK-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-NEXT: add x8, sp, #376 -; CHECK-NEXT: ld1 { v18.b }[1], [x11] -; CHECK-NEXT: mov v3.b[7], w7 -; CHECK-NEXT: ld1 { v6.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #576 +; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #368 +; CHECK-NEXT: ld1 { v6.b }[2], [x9] +; CHECK-NEXT: add x11, sp, #488 +; CHECK-NEXT: ldr b7, [sp, #144] +; CHECK-NEXT: mov v4.b[7], w7 +; CHECK-NEXT: ld1 { v18.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #376 +; CHECK-NEXT: sshll v17.8h, v5.8b, #0 +; CHECK-NEXT: ldr b5, [sp, #480] ; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[3], [x8] +; CHECK-NEXT: ld1 { v6.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #576 +; CHECK-NEXT: add x8, sp, #312 +; CHECK-NEXT: ld1 { v5.b }[1], [x11] +; CHECK-NEXT: ld1 { v18.b }[3], [x10] ; CHECK-NEXT: add x11, sp, #496 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: add x8, sp, #384 -; CHECK-NEXT: ld1 { v18.b }[2], [x11] -; CHECK-NEXT: ld1 { v6.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #584 +; CHECK-NEXT: sshll v16.8h, v4.8b, #0 +; CHECK-NEXT: ldr b4, [sp, #344] +; CHECK-NEXT: add x10, sp, #384 +; CHECK-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #584 +; CHECK-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-NEXT: sshll v19.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v5.b }[2], [x11] +; CHECK-NEXT: ld1 { v18.b }[4], [x10] +; CHECK-NEXT: smull2 v4.4s, v16.8h, v17.8h +; CHECK-NEXT: smull v16.4s, v16.4h, v17.4h +; CHECK-NEXT: ldr b17, [sp, #416] ; CHECK-NEXT: add x11, sp, #504 -; CHECK-NEXT: sshll v17.8h, v3.8b, #0 -; CHECK-NEXT: ldr b3, [sp, #344] -; CHECK-NEXT: ld1 { v16.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #424 +; CHECK-NEXT: add x10, sp, #424 ; CHECK-NEXT: add x16, sp, #320 -; CHECK-NEXT: ld1 { v18.b }[3], [x11] -; CHECK-NEXT: sshll v19.8h, v3.8b, #0 +; CHECK-NEXT: smull v19.4s, v7.4h, v19.4h +; CHECK-NEXT: movi v7.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v5.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #392 +; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #592 ; CHECK-NEXT: ld1 { v2.b }[5], [x16] -; CHECK-NEXT: smull2 v3.4s, v17.8h, v5.8h -; CHECK-NEXT: smull v5.4s, v17.4h, v5.4h -; CHECK-NEXT: movi v17.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v6.b }[5], [x11] -; CHECK-NEXT: add x12, sp, #248 +; CHECK-NEXT: ld1 { v18.b }[5], [x10] ; CHECK-NEXT: add x11, sp, #512 -; CHECK-NEXT: smull v7.4s, v7.4h, v19.4h -; CHECK-NEXT: ldr b19, [sp, #416] -; CHECK-NEXT: ld1 { v4.b }[4], [x12] +; CHECK-NEXT: add x10, sp, #432 ; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: ld1 { v18.b }[4], [x11] +; CHECK-NEXT: mov v7.s[0], v19.s[0] +; CHECK-NEXT: ld1 { v5.b }[4], [x11] ; CHECK-NEXT: add x11, sp, #400 -; CHECK-NEXT: ld1 { v19.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #592 +; CHECK-NEXT: ld1 { v17.b }[2], [x10] +; CHECK-NEXT: 
add x10, sp, #600 +; CHECK-NEXT: ldr b19, [sp, #680] ; CHECK-NEXT: ldr b20, [sp, #616] -; CHECK-NEXT: ld1 { v16.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #432 ; CHECK-NEXT: ld1 { v2.b }[6], [x12] -; CHECK-NEXT: mov v17.s[0], v7.s[0] -; CHECK-NEXT: ldr b7, [sp, #680] ; CHECK-NEXT: ld1 { v6.b }[6], [x11] -; CHECK-NEXT: ld1 { v19.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: ld1 { v18.b }[6], [x10] ; CHECK-NEXT: add x11, sp, #688 -; CHECK-NEXT: ld1 { v16.b }[6], [x8] ; CHECK-NEXT: add x12, sp, #624 -; CHECK-NEXT: ld1 { v7.b }[1], [x11] +; CHECK-NEXT: ld1 { v19.b }[1], [x11] ; CHECK-NEXT: ld1 { v20.b }[1], [x12] -; CHECK-NEXT: add x8, sp, #408 +; CHECK-NEXT: add x10, sp, #408 ; CHECK-NEXT: add x11, sp, #608 ; CHECK-NEXT: add x12, sp, #440 -; CHECK-NEXT: ld1 { v6.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #696 -; CHECK-NEXT: ld1 { v16.b }[7], [x11] -; CHECK-NEXT: ld1 { v19.b }[3], [x12] +; CHECK-NEXT: ld1 { v6.b }[7], [x10] +; CHECK-NEXT: ld1 { v18.b }[7], [x11] +; CHECK-NEXT: ld1 { v17.b }[3], [x12] +; CHECK-NEXT: add x10, sp, #696 ; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-NEXT: ld1 { v19.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #448 ; CHECK-NEXT: ld1 { v20.b }[2], [x11] -; CHECK-NEXT: add x8, sp, #448 ; CHECK-NEXT: add x11, sp, #640 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: add x13, sp, #256 -; CHECK-NEXT: ld1 { v19.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #704 -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-NEXT: ld1 { v7.b }[3], [x8] +; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: ld1 { v19.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #712 +; CHECK-NEXT: add x12, sp, #520 ; CHECK-NEXT: ld1 { v20.b }[3], [x11] -; CHECK-NEXT: add x8, sp, #712 ; CHECK-NEXT: add x11, sp, #648 -; CHECK-NEXT: add x12, sp, #520 -; CHECK-NEXT: ld1 { v4.b }[5], [x13] ; CHECK-NEXT: ldr b21, [sp, #544] -; CHECK-NEXT: smull2 v22.4s, v6.8h, v16.8h -; CHECK-NEXT: smull v6.4s, v6.4h, v16.4h -; CHECK-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-NEXT: smull2 v22.4s, v6.8h, v18.8h +; CHECK-NEXT: smull v6.4s, v6.4h, v18.4h +; CHECK-NEXT: ldr b18, [sp, #744] +; CHECK-NEXT: ld1 { v19.b }[4], [x10] +; CHECK-NEXT: ld1 { v5.b }[5], [x12] +; CHECK-NEXT: add x12, sp, #656 ; CHECK-NEXT: ld1 { v20.b }[4], [x11] ; CHECK-NEXT: add x11, sp, #456 -; CHECK-NEXT: ldr b16, [sp, #744] -; CHECK-NEXT: ld1 { v18.b }[5], [x12] -; CHECK-NEXT: ld1 { v19.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #720 -; CHECK-NEXT: add x12, sp, #656 -; CHECK-NEXT: add x9, sp, #264 -; CHECK-NEXT: ld1 { v7.b }[5], [x11] -; CHECK-NEXT: ld1 { v20.b }[5], [x12] ; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-NEXT: add x8, sp, #528 -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: ld1 { v18.b }[6], [x8] +; CHECK-NEXT: ld1 { v17.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #720 +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: ld1 { v19.b }[5], [x11] +; CHECK-NEXT: add x10, sp, #528 ; CHECK-NEXT: add x11, sp, #464 +; CHECK-NEXT: ld1 { v20.b }[5], [x12] +; CHECK-NEXT: ld1 { v5.b }[6], [x10] ; CHECK-NEXT: add x12, sp, #728 ; CHECK-NEXT: add x13, sp, #664 -; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v19.b }[6], [x11] -; CHECK-NEXT: ld1 { v7.b }[6], [x12] +; CHECK-NEXT: add x8, sp, #72 +; CHECK-NEXT: ld1 { v17.b }[6], [x11] +; CHECK-NEXT: ld1 { v19.b }[6], [x12] +; CHECK-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #336 ; CHECK-NEXT: ld1 { v20.b }[6], [x13] -; CHECK-NEXT: ld1 { v0.b }[7], 
[x10] -; CHECK-NEXT: add x10, sp, #336 ; CHECK-NEXT: add x9, sp, #272 -; CHECK-NEXT: smull v16.4s, v21.4h, v16.4h +; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h ; CHECK-NEXT: movi v21.2d, #0000000000000000 -; CHECK-NEXT: add x8, sp, #536 -; CHECK-NEXT: ld1 { v2.b }[7], [x10] -; CHECK-NEXT: ld1 { v4.b }[7], [x9] -; CHECK-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-NEXT: add x10, sp, #536 +; CHECK-NEXT: ld1 { v2.b }[7], [x8] +; CHECK-NEXT: ld1 { v3.b }[7], [x9] +; CHECK-NEXT: ld1 { v5.b }[7], [x10] ; CHECK-NEXT: add x8, sp, #472 ; CHECK-NEXT: add x9, sp, #736 ; CHECK-NEXT: add x10, sp, #672 -; CHECK-NEXT: ld1 { v19.b }[7], [x8] -; CHECK-NEXT: ld1 { v7.b }[7], [x9] +; CHECK-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-NEXT: ld1 { v19.b }[7], [x9] ; CHECK-NEXT: ld1 { v20.b }[7], [x10] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: mov v21.s[0], v16.s[0] +; CHECK-NEXT: mov v21.s[0], v18.s[0] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: sshll v16.8h, v18.8b, #0 +; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 ; CHECK-NEXT: sshll v18.8h, v19.8b, #0 -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 ; CHECK-NEXT: sshll v19.8h, v20.8b, #0 -; CHECK-NEXT: smlal v5.4s, v0.4h, v2.4h -; CHECK-NEXT: smlal2 v3.4s, v0.8h, v2.8h -; CHECK-NEXT: smlal v17.4s, v1.4h, v4.4h -; CHECK-NEXT: smlal v6.4s, v16.4h, v7.4h -; CHECK-NEXT: smlal2 v22.4s, v16.8h, v7.8h -; CHECK-NEXT: smlal v21.4s, v18.4h, v19.4h -; CHECK-NEXT: smlal2 v3.4s, v1.8h, v4.8h -; CHECK-NEXT: add v0.4s, v5.4s, v17.4s +; CHECK-NEXT: smlal v16.4s, v0.4h, v2.4h +; CHECK-NEXT: smlal2 v4.4s, v0.8h, v2.8h +; CHECK-NEXT: smlal v7.4s, v1.4h, v3.4h +; CHECK-NEXT: smlal v6.4s, v5.4h, v18.4h +; CHECK-NEXT: smlal2 v22.4s, v5.8h, v18.8h +; CHECK-NEXT: smlal v21.4s, v17.4h, v19.4h +; CHECK-NEXT: smlal2 v4.4s, v1.8h, v3.8h +; CHECK-NEXT: add v0.4s, v16.4s, v7.4s ; CHECK-NEXT: add v1.4s, v6.4s, v21.4s -; CHECK-NEXT: smlal2 v22.4s, v18.8h, v19.8h -; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: smlal2 v22.4s, v17.8h, v19.8h +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s ; CHECK-NEXT: add v1.4s, v1.4s, v22.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -1860,10 +1860,10 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-NEXT: sshll v23.8h, v16.8b, #0 ; CHECK-NEXT: ld1 { v7.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #520 -; CHECK-NEXT: ldr b24, [sp, #872] +; CHECK-NEXT: movi v19.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v22.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #528 -; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: add x10, sp, #464 ; CHECK-NEXT: ld1 { v4.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #568 ; CHECK-NEXT: smull2 v18.4s, v20.8h, v23.8h @@ -1878,13 +1878,13 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-NEXT: ldr b23, [sp, #1000] ; CHECK-NEXT: ld1 { v7.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #688 -; CHECK-NEXT: sshll v22.8h, v22.8b, #0 +; CHECK-NEXT: sshll v24.8h, v22.8b, #0 ; CHECK-NEXT: ld1 { v21.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #696 -; CHECK-NEXT: sshll v23.8h, v23.8b, #0 +; CHECK-NEXT: sshll v25.8h, v23.8b, #0 ; CHECK-NEXT: add x8, sp, #536 -; CHECK-NEXT: ldr b25, [sp, #936] -; CHECK-NEXT: add x10, sp, #464 +; CHECK-NEXT: ldr b22, [sp, #872] +; CHECK-NEXT: ldr b23, [sp, #936] ; CHECK-NEXT: ld1 { v4.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #584 ; CHECK-NEXT: ld1 { v17.b }[7], [x10] @@ -1892,110 +1892,110 @@ define 
i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #880 ; CHECK-NEXT: add x9, sp, #704 -; CHECK-NEXT: smull v22.4s, v22.4h, v23.4h -; CHECK-NEXT: ldr b23, [sp, #744] -; CHECK-NEXT: ld1 { v24.b }[1], [x8] +; CHECK-NEXT: smull v25.4s, v24.4h, v25.4h +; CHECK-NEXT: ldr b24, [sp, #744] +; CHECK-NEXT: ld1 { v22.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #944 ; CHECK-NEXT: add x10, sp, #888 ; CHECK-NEXT: ld1 { v21.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #752 -; CHECK-NEXT: ld1 { v25.b }[1], [x8] -; CHECK-NEXT: ld1 { v23.b }[1], [x9] +; CHECK-NEXT: ld1 { v23.b }[1], [x8] +; CHECK-NEXT: ld1 { v24.b }[1], [x9] ; CHECK-NEXT: add x8, sp, #712 ; CHECK-NEXT: add x9, sp, #760 -; CHECK-NEXT: ld1 { v24.b }[2], [x10] +; CHECK-NEXT: ld1 { v22.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #952 -; CHECK-NEXT: mov v19.s[0], v22.s[0] -; CHECK-NEXT: ldr b22, [sp, #808] -; CHECK-NEXT: ld1 { v25.b }[2], [x10] +; CHECK-NEXT: mov v19.s[0], v25.s[0] +; CHECK-NEXT: ldr b25, [sp, #808] +; CHECK-NEXT: ld1 { v23.b }[2], [x10] ; CHECK-NEXT: ld1 { v21.b }[5], [x8] -; CHECK-NEXT: ld1 { v23.b }[2], [x9] +; CHECK-NEXT: ld1 { v24.b }[2], [x9] ; CHECK-NEXT: add x8, sp, #816 ; CHECK-NEXT: add x9, sp, #896 -; CHECK-NEXT: ld1 { v22.b }[1], [x8] +; CHECK-NEXT: ld1 { v25.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #960 -; CHECK-NEXT: ld1 { v24.b }[3], [x9] +; CHECK-NEXT: ld1 { v22.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #768 -; CHECK-NEXT: ld1 { v25.b }[3], [x8] +; CHECK-NEXT: ld1 { v23.b }[3], [x8] ; CHECK-NEXT: add x10, sp, #904 -; CHECK-NEXT: ld1 { v23.b }[3], [x9] +; CHECK-NEXT: ld1 { v24.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #824 ; CHECK-NEXT: add x8, sp, #720 -; CHECK-NEXT: ld1 { v22.b }[2], [x9] +; CHECK-NEXT: ld1 { v25.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #968 -; CHECK-NEXT: ld1 { v24.b }[4], [x10] +; CHECK-NEXT: ld1 { v22.b }[4], [x10] ; CHECK-NEXT: add x10, sp, #776 -; CHECK-NEXT: ld1 { v25.b }[4], [x9] +; CHECK-NEXT: ld1 { v23.b }[4], [x9] ; CHECK-NEXT: ld1 { v21.b }[6], [x8] -; CHECK-NEXT: ld1 { v23.b }[4], [x10] +; CHECK-NEXT: ld1 { v24.b }[4], [x10] ; CHECK-NEXT: add x8, sp, #832 ; CHECK-NEXT: add x9, sp, #912 -; CHECK-NEXT: ld1 { v22.b }[3], [x8] +; CHECK-NEXT: ld1 { v25.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #976 -; CHECK-NEXT: ld1 { v24.b }[5], [x9] +; CHECK-NEXT: ld1 { v22.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #784 -; CHECK-NEXT: ld1 { v25.b }[5], [x8] +; CHECK-NEXT: ld1 { v23.b }[5], [x8] ; CHECK-NEXT: add x10, sp, #920 -; CHECK-NEXT: ld1 { v23.b }[5], [x9] +; CHECK-NEXT: ld1 { v24.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #840 ; CHECK-NEXT: add x8, sp, #728 -; CHECK-NEXT: ld1 { v22.b }[4], [x9] +; CHECK-NEXT: ld1 { v25.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #984 -; CHECK-NEXT: ld1 { v24.b }[6], [x10] +; CHECK-NEXT: ld1 { v22.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #792 -; CHECK-NEXT: ld1 { v25.b }[6], [x9] +; CHECK-NEXT: ld1 { v23.b }[6], [x9] ; CHECK-NEXT: ld1 { v21.b }[7], [x8] -; CHECK-NEXT: ld1 { v23.b }[6], [x10] +; CHECK-NEXT: ld1 { v24.b }[6], [x10] ; CHECK-NEXT: add x8, sp, #848 ; CHECK-NEXT: add x9, sp, #928 -; CHECK-NEXT: ld1 { v22.b }[5], [x8] +; CHECK-NEXT: ld1 { v25.b }[5], [x8] ; CHECK-NEXT: add x12, sp, #72 ; CHECK-NEXT: add x8, sp, #992 -; CHECK-NEXT: ld1 { v24.b }[7], [x9] +; CHECK-NEXT: ld1 { v22.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #800 ; CHECK-NEXT: ld1 { v3.b }[7], [x12] -; CHECK-NEXT: ld1 { v25.b }[7], [x8] +; CHECK-NEXT: ld1 { v23.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #592 -; CHECK-NEXT: ld1 { v23.b }[7], [x9] +; 
CHECK-NEXT: ld1 { v24.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #856 ; CHECK-NEXT: ld1 { v7.b }[6], [x8] ; CHECK-NEXT: add x11, sp, #200 -; CHECK-NEXT: ld1 { v22.b }[6], [x9] +; CHECK-NEXT: ld1 { v25.b }[6], [x9] ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 ; CHECK-NEXT: sshll v5.8h, v5.8b, #0 ; CHECK-NEXT: sshll v4.8h, v4.8b, #0 ; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: sshll v24.8h, v24.8b, #0 -; CHECK-NEXT: sshll v25.8h, v25.8b, #0 -; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: sshll v22.8h, v22.8b, #0 ; CHECK-NEXT: sshll v23.8h, v23.8b, #0 +; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: sshll v24.8h, v24.8b, #0 ; CHECK-NEXT: add x9, sp, #864 ; CHECK-NEXT: ld1 { v2.b }[7], [x11] ; CHECK-NEXT: ld1 { v7.b }[7], [x8] -; CHECK-NEXT: ld1 { v22.b }[7], [x9] +; CHECK-NEXT: ld1 { v25.b }[7], [x9] ; CHECK-NEXT: smull v16.4s, v3.4h, v5.4h ; CHECK-NEXT: smull2 v3.4s, v3.8h, v5.8h -; CHECK-NEXT: smull v5.4s, v21.4h, v25.4h -; CHECK-NEXT: smull2 v21.4s, v21.8h, v25.8h -; CHECK-NEXT: smull2 v25.4s, v20.8h, v24.8h -; CHECK-NEXT: smlal v19.4s, v4.4h, v23.4h +; CHECK-NEXT: smull v5.4s, v21.4h, v23.4h +; CHECK-NEXT: smull2 v21.4s, v21.8h, v23.8h +; CHECK-NEXT: smull2 v23.4s, v20.8h, v22.8h +; CHECK-NEXT: smlal v19.4s, v4.4h, v24.4h ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v17.8h, v17.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: sshll v22.8h, v22.8b, #0 +; CHECK-NEXT: sshll v25.8h, v25.8b, #0 ; CHECK-NEXT: smlal2 v3.4s, v2.8h, v17.8h ; CHECK-NEXT: smlal v16.4s, v2.4h, v17.4h -; CHECK-NEXT: smlal2 v25.4s, v4.8h, v23.8h +; CHECK-NEXT: smlal2 v23.4s, v4.8h, v24.8h ; CHECK-NEXT: smlal2 v18.4s, v0.8h, v1.8h ; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h -; CHECK-NEXT: smlal v19.4s, v20.4h, v24.4h -; CHECK-NEXT: smlal2 v21.4s, v7.8h, v22.8h -; CHECK-NEXT: smlal v5.4s, v7.4h, v22.4h +; CHECK-NEXT: smlal v19.4s, v20.4h, v22.4h +; CHECK-NEXT: smlal2 v21.4s, v7.8h, v25.8h +; CHECK-NEXT: smlal v5.4s, v7.4h, v25.4h ; CHECK-NEXT: add v0.4s, v18.4s, v3.4s ; CHECK-NEXT: add v1.4s, v6.4s, v16.4s -; CHECK-NEXT: add v2.4s, v25.4s, v21.4s +; CHECK-NEXT: add v2.4s, v23.4s, v21.4s ; CHECK-NEXT: add v3.4s, v19.4s, v5.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: add v1.4s, v3.4s, v2.4s @@ -2267,14 +2267,14 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ldr b3, [sp, #592] ; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: ldr b4, [sp, #208] +; CHECK-NEXT: ldr b6, [sp, #208] ; CHECK-NEXT: ldr b0, [sp, #336] ; CHECK-NEXT: add x9, sp, #344 ; CHECK-NEXT: ldr b2, [sp, #464] ; CHECK-NEXT: ld1 { v3.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #216 ; CHECK-NEXT: add x10, sp, #624 -; CHECK-NEXT: ld1 { v4.b }[1], [x8] +; CHECK-NEXT: ld1 { v6.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #608 ; CHECK-NEXT: ld1 { v0.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #232 @@ -2282,17 +2282,17 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-NEXT: ldr b7, [sp, #1360] ; CHECK-NEXT: ld1 { v3.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #224 -; CHECK-NEXT: add x12, sp, #376 -; CHECK-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-NEXT: add x11, sp, #648 +; CHECK-NEXT: ld1 { v6.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #616 -; CHECK-NEXT: add x11, sp, #656 +; CHECK-NEXT: add x12, sp, #376 ; CHECK-NEXT: mov v1.b[1], w1 -; CHECK-NEXT: ldr b17, [sp, #976] +; CHECK-NEXT: ldr b16, [sp, #976] ; CHECK-NEXT: add x14, sp, #288 ; CHECK-NEXT: ld1 { v3.b }[3], [x8] ; 
CHECK-NEXT: add x8, sp, #632 ; CHECK-NEXT: add x15, sp, #408 -; CHECK-NEXT: ld1 { v4.b }[3], [x9] +; CHECK-NEXT: ld1 { v6.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #472 ; CHECK-NEXT: add x13, sp, #696 ; CHECK-NEXT: ld1 { v2.b }[1], [x9] @@ -2301,344 +2301,344 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 ; CHECK-NEXT: ld1 { v3.b }[4], [x10] ; CHECK-NEXT: add x10, sp, #352 ; CHECK-NEXT: mov v1.b[2], w2 -; CHECK-NEXT: ld1 { v4.b }[4], [x9] +; CHECK-NEXT: ld1 { v6.b }[4], [x9] ; CHECK-NEXT: ld1 { v0.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #1368 ; CHECK-NEXT: ld1 { v7.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #248 ; CHECK-NEXT: add x9, sp, #640 ; CHECK-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #648 -; CHECK-NEXT: movi v6.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v4.b }[5], [x10] +; CHECK-NEXT: add x8, sp, #656 +; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v6.b }[5], [x10] ; CHECK-NEXT: add x10, sp, #360 ; CHECK-NEXT: mov v1.b[3], w3 ; CHECK-NEXT: ld1 { v0.b }[3], [x10] ; CHECK-NEXT: add x10, sp, #256 -; CHECK-NEXT: movi v5.2d, #0000000000000000 +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v3.b }[6], [x9] ; CHECK-NEXT: add x9, sp, #368 -; CHECK-NEXT: ldr b16, [sp, #720] -; CHECK-NEXT: ld1 { v4.b }[6], [x10] +; CHECK-NEXT: ldr b17, [sp, #720] +; CHECK-NEXT: ld1 { v6.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #984 ; CHECK-NEXT: ld1 { v0.b }[4], [x9] -; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: ld1 { v16.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #664 -; CHECK-NEXT: ld1 { v3.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #264 +; CHECK-NEXT: ld1 { v3.b }[7], [x11] +; CHECK-NEXT: add x11, sp, #264 ; CHECK-NEXT: mov v1.b[4], w4 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: ld1 { v6.b }[7], [x11] ; CHECK-NEXT: add x9, sp, #672 -; CHECK-NEXT: add x8, sp, #680 +; CHECK-NEXT: add x11, sp, #680 ; CHECK-NEXT: ld1 { v0.b }[5], [x12] ; CHECK-NEXT: add x12, sp, #480 ; CHECK-NEXT: ld1 { v2.b }[2], [x12] ; CHECK-NEXT: add x12, sp, #272 -; CHECK-NEXT: ld1 { v3.b }[8], [x11] -; CHECK-NEXT: ld1 { v4.b }[8], [x12] +; CHECK-NEXT: ld1 { v3.b }[8], [x8] +; CHECK-NEXT: ld1 { v6.b }[8], [x12] ; CHECK-NEXT: add x12, sp, #384 ; CHECK-NEXT: mov v1.b[5], w5 ; CHECK-NEXT: ld1 { v0.b }[6], [x12] ; CHECK-NEXT: add x12, sp, #280 -; CHECK-NEXT: add x11, sp, #688 +; CHECK-NEXT: add x8, sp, #688 ; CHECK-NEXT: ld1 { v3.b }[9], [x10] ; CHECK-NEXT: add x10, sp, #1376 ; CHECK-NEXT: ld1 { v7.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #392 -; CHECK-NEXT: ld1 { v4.b }[9], [x12] +; CHECK-NEXT: ld1 { v6.b }[9], [x12] ; CHECK-NEXT: ld1 { v0.b }[7], [x10] ; CHECK-NEXT: mov v1.b[6], w6 ; CHECK-NEXT: add x12, sp, #704 ; CHECK-NEXT: ld1 { v3.b }[10], [x9] ; CHECK-NEXT: add x9, sp, #400 ; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: ld1 { v4.b }[10], [x14] +; CHECK-NEXT: ld1 { v6.b }[10], [x14] ; CHECK-NEXT: add x14, sp, #992 ; CHECK-NEXT: ld1 { v0.b }[8], [x9] -; CHECK-NEXT: ld1 { v17.b }[2], [x14] +; CHECK-NEXT: ld1 { v16.b }[2], [x14] ; CHECK-NEXT: add x14, sp, #296 -; CHECK-NEXT: ld1 { v3.b }[11], [x8] +; CHECK-NEXT: ld1 { v3.b }[11], [x11] ; CHECK-NEXT: add x9, sp, #304 -; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v4.b }[11], [x14] +; CHECK-NEXT: add x11, sp, #312 +; CHECK-NEXT: ld1 { v6.b }[11], [x14] ; CHECK-NEXT: mov v1.b[7], w7 ; CHECK-NEXT: add x14, sp, #320 ; CHECK-NEXT: ld1 { v0.b }[9], [x15] ; CHECK-NEXT: add x15, sp, #328 -; CHECK-NEXT: ld1 { v3.b }[12], [x11] -; CHECK-NEXT: add x11, sp, #416 -; CHECK-NEXT: ld1 { v4.b }[12], [x9] +; 
CHECK-NEXT: ld1 { v3.b }[12], [x8] +; CHECK-NEXT: add x8, sp, #416 +; CHECK-NEXT: ld1 { v6.b }[12], [x9] ; CHECK-NEXT: add x9, sp, #1384 -; CHECK-NEXT: ld1 { v0.b }[10], [x11] +; CHECK-NEXT: ld1 { v0.b }[10], [x8] ; CHECK-NEXT: ld1 { v7.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #424 ; CHECK-NEXT: ld1 { v3.b }[13], [x13] -; CHECK-NEXT: add x11, sp, #432 +; CHECK-NEXT: add x8, sp, #432 ; CHECK-NEXT: add x13, sp, #440 -; CHECK-NEXT: ld1 { v4.b }[13], [x8] -; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ld1 { v6.b }[13], [x11] +; CHECK-NEXT: add x11, sp, #16 ; CHECK-NEXT: ld1 { v0.b }[11], [x9] ; CHECK-NEXT: add x9, sp, #1000 -; CHECK-NEXT: ld1 { v1.b }[8], [x8] -; CHECK-NEXT: ld1 { v17.b }[3], [x9] +; CHECK-NEXT: ld1 { v1.b }[8], [x11] +; CHECK-NEXT: ld1 { v16.b }[3], [x9] ; CHECK-NEXT: ld1 { v3.b }[14], [x12] ; CHECK-NEXT: add x12, sp, #488 -; CHECK-NEXT: ld1 { v4.b }[14], [x14] +; CHECK-NEXT: ld1 { v6.b }[14], [x14] ; CHECK-NEXT: add x14, sp, #1392 ; CHECK-NEXT: ld1 { v2.b }[3], [x12] ; CHECK-NEXT: ld1 { v7.b }[4], [x14] -; CHECK-NEXT: add x8, sp, #1008 -; CHECK-NEXT: ld1 { v0.b }[12], [x11] -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x11, sp, #1400 -; CHECK-NEXT: add x8, sp, #496 -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #1016 -; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: ld1 { v7.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #1008 +; CHECK-NEXT: ld1 { v0.b }[12], [x8] +; CHECK-NEXT: ld1 { v16.b }[4], [x11] +; CHECK-NEXT: add x8, sp, #1400 ; CHECK-NEXT: ld1 { v3.b }[15], [x10] +; CHECK-NEXT: add x10, sp, #496 +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: ld1 { v6.b }[15], [x15] +; CHECK-NEXT: ld1 { v7.b }[5], [x8] +; CHECK-NEXT: ld1 { v2.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #1016 +; CHECK-NEXT: ld1 { v16.b }[5], [x10] ; CHECK-NEXT: ld1 { v0.b }[13], [x13] -; CHECK-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-NEXT: add x10, sp, #1408 +; CHECK-NEXT: add x8, sp, #1408 ; CHECK-NEXT: ld1 { v1.b }[9], [x9] -; CHECK-NEXT: add x8, sp, #504 -; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: ld1 { v4.b }[15], [x15] -; CHECK-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-NEXT: ld1 { v2.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #1024 -; CHECK-NEXT: ld1 { v17.b }[6], [x8] -; CHECK-NEXT: ld1 { v0.b }[14], [x16] -; CHECK-NEXT: ld1 { v1.b }[10], [x9] -; CHECK-NEXT: add x9, sp, #1416 +; CHECK-NEXT: add x9, sp, #504 ; CHECK-NEXT: add x10, sp, #512 -; CHECK-NEXT: add x8, sp, #456 -; CHECK-NEXT: ld1 { v7.b }[7], [x9] +; CHECK-NEXT: ld1 { v7.b }[6], [x8] +; CHECK-NEXT: ld1 { v2.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #1024 +; CHECK-NEXT: add x8, sp, #32 +; CHECK-NEXT: ld1 { v16.b }[6], [x9] +; CHECK-NEXT: ld1 { v0.b }[14], [x16] +; CHECK-NEXT: ld1 { v1.b }[10], [x8] +; CHECK-NEXT: add x8, sp, #1416 +; CHECK-NEXT: add x9, sp, #456 +; CHECK-NEXT: ld1 { v7.b }[7], [x8] ; CHECK-NEXT: ld1 { v2.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #1032 -; CHECK-NEXT: add x9, sp, #40 -; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: ld1 { v0.b }[15], [x8] -; CHECK-NEXT: ld1 { v1.b }[11], [x9] -; CHECK-NEXT: add x9, sp, #1424 -; CHECK-NEXT: add x8, sp, #520 -; CHECK-NEXT: ld1 { v7.b }[8], [x9] -; CHECK-NEXT: ld1 { v2.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #1040 -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: ld1 { v17.b }[8], [x8] +; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: ld1 { v16.b }[7], [x10] +; CHECK-NEXT: ld1 { v0.b }[15], [x9] +; CHECK-NEXT: ld1 { v1.b }[11], [x8] +; CHECK-NEXT: add x8, sp, #1424 +; CHECK-NEXT: add x9, sp, #520 +; CHECK-NEXT: ld1 { v7.b }[8], [x8] +; CHECK-NEXT: ld1 { v2.b }[7], [x9] +; 
CHECK-NEXT: add x9, sp, #1040 +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: ld1 { v16.b }[8], [x9] ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: ld1 { v1.b }[12], [x9] -; CHECK-NEXT: add x9, sp, #1432 -; CHECK-NEXT: sdot v6.4s, v4.16b, v3.16b -; CHECK-NEXT: ld1 { v7.b }[9], [x9] +; CHECK-NEXT: ld1 { v1.b }[12], [x8] +; CHECK-NEXT: add x8, sp, #1432 +; CHECK-NEXT: sdot v5.4s, v6.16b, v3.16b +; CHECK-NEXT: ld1 { v7.b }[9], [x8] ; CHECK-NEXT: ld1 { v2.b }[8], [x10] -; CHECK-NEXT: add x9, sp, #1048 +; CHECK-NEXT: add x8, sp, #1048 ; CHECK-NEXT: ldr b3, [sp, #80] -; CHECK-NEXT: ld1 { v17.b }[9], [x9] -; CHECK-NEXT: add x8, sp, #56 +; CHECK-NEXT: ld1 { v16.b }[9], [x8] ; CHECK-NEXT: add x10, sp, #88 -; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: add x8, sp, #536 ; CHECK-NEXT: add x11, sp, #1440 +; CHECK-NEXT: add x9, sp, #56 ; CHECK-NEXT: ld1 { v3.b }[1], [x10] -; CHECK-NEXT: ld1 { v1.b }[13], [x8] -; CHECK-NEXT: ld1 { v2.b }[9], [x9] +; CHECK-NEXT: ld1 { v2.b }[9], [x8] ; CHECK-NEXT: add x8, sp, #1056 ; CHECK-NEXT: ld1 { v7.b }[10], [x11] +; CHECK-NEXT: ld1 { v16.b }[10], [x8] +; CHECK-NEXT: ld1 { v1.b }[13], [x9] ; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ld1 { v17.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #544 ; CHECK-NEXT: add x10, sp, #1448 ; CHECK-NEXT: ld1 { v3.b }[2], [x9] ; CHECK-NEXT: ld1 { v2.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #1064 ; CHECK-NEXT: ld1 { v7.b }[11], [x10] +; CHECK-NEXT: ld1 { v16.b }[11], [x8] ; CHECK-NEXT: add x10, sp, #104 -; CHECK-NEXT: add x11, sp, #1456 -; CHECK-NEXT: ld1 { v17.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #552 +; CHECK-NEXT: add x11, sp, #1456 ; CHECK-NEXT: add x9, sp, #64 ; CHECK-NEXT: ld1 { v3.b }[3], [x10] ; CHECK-NEXT: ld1 { v2.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #1072 ; CHECK-NEXT: ld1 { v7.b }[12], [x11] +; CHECK-NEXT: ld1 { v16.b }[12], [x8] ; CHECK-NEXT: ld1 { v1.b }[14], [x9] ; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: ld1 { v17.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #560 ; CHECK-NEXT: add x10, sp, #1464 ; CHECK-NEXT: ld1 { v3.b }[4], [x9] ; CHECK-NEXT: ld1 { v2.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #1080 ; CHECK-NEXT: ld1 { v7.b }[13], [x10] +; CHECK-NEXT: ld1 { v16.b }[13], [x8] ; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: add x11, sp, #1472 -; CHECK-NEXT: ld1 { v17.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #568 +; CHECK-NEXT: add x11, sp, #1472 ; CHECK-NEXT: add x9, sp, #72 ; CHECK-NEXT: ld1 { v3.b }[5], [x10] ; CHECK-NEXT: ld1 { v2.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #1088 ; CHECK-NEXT: ld1 { v7.b }[14], [x11] +; CHECK-NEXT: ld1 { v16.b }[14], [x8] ; CHECK-NEXT: ld1 { v1.b }[15], [x9] ; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: ld1 { v17.b }[14], [x8] -; CHECK-NEXT: ldr b4, [sp, #1104] +; CHECK-NEXT: ldr b6, [sp, #1104] ; CHECK-NEXT: add x10, sp, #1480 ; CHECK-NEXT: ld1 { v3.b }[6], [x9] ; CHECK-NEXT: add x8, sp, #1096 ; CHECK-NEXT: add x9, sp, #1112 ; CHECK-NEXT: ld1 { v7.b }[15], [x10] -; CHECK-NEXT: ld1 { v4.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: ld1 { v17.b }[15], [x8] +; CHECK-NEXT: ld1 { v16.b }[15], [x8] +; CHECK-NEXT: ld1 { v6.b }[1], [x9] ; CHECK-NEXT: add x8, sp, #728 +; CHECK-NEXT: add x9, sp, #576 ; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v16.b }[1], [x8] +; CHECK-NEXT: ld1 { v17.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #1120 ; CHECK-NEXT: ld1 { v2.b }[14], [x9] -; CHECK-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-NEXT: sdot v4.4s, v16.16b, v7.16b +; CHECK-NEXT: ld1 { v6.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #736 -; CHECK-NEXT: ld1 { v3.b }[7], [x10] -; CHECK-NEXT: sdot v5.4s, 
v17.16b, v7.16b ; CHECK-NEXT: ldr b7, [sp, #1232] -; CHECK-NEXT: ldr b17, [sp, #848] -; CHECK-NEXT: ld1 { v16.b }[2], [x8] +; CHECK-NEXT: ldr b16, [sp, #848] +; CHECK-NEXT: ld1 { v3.b }[7], [x10] +; CHECK-NEXT: ld1 { v17.b }[2], [x8] ; CHECK-NEXT: add x9, sp, #1240 ; CHECK-NEXT: add x10, sp, #856 ; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: ld1 { v16.b }[1], [x10] ; CHECK-NEXT: add x8, sp, #1128 ; CHECK-NEXT: add x11, sp, #744 -; CHECK-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-NEXT: ld1 { v6.b }[3], [x8] ; CHECK-NEXT: add x10, sp, #1248 -; CHECK-NEXT: ld1 { v16.b }[3], [x11] +; CHECK-NEXT: ld1 { v17.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #864 ; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: ld1 { v7.b }[2], [x10] -; CHECK-NEXT: ld1 { v17.b }[2], [x11] +; CHECK-NEXT: ld1 { v16.b }[2], [x11] ; CHECK-NEXT: add x8, sp, #1136 ; CHECK-NEXT: add x12, sp, #752 ; CHECK-NEXT: ld1 { v3.b }[8], [x9] -; CHECK-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-NEXT: ld1 { v16.b }[4], [x12] +; CHECK-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-NEXT: ld1 { v17.b }[4], [x12] ; CHECK-NEXT: add x9, sp, #1256 ; CHECK-NEXT: add x10, sp, #872 ; CHECK-NEXT: ld1 { v7.b }[3], [x9] -; CHECK-NEXT: ld1 { v17.b }[3], [x10] +; CHECK-NEXT: ld1 { v16.b }[3], [x10] ; CHECK-NEXT: add x8, sp, #1144 ; CHECK-NEXT: add x11, sp, #760 -; CHECK-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-NEXT: ld1 { v6.b }[5], [x8] ; CHECK-NEXT: add x10, sp, #1264 -; CHECK-NEXT: ld1 { v16.b }[5], [x11] +; CHECK-NEXT: ld1 { v17.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #880 ; CHECK-NEXT: add x9, sp, #152 ; CHECK-NEXT: ld1 { v7.b }[4], [x10] -; CHECK-NEXT: ld1 { v17.b }[4], [x11] +; CHECK-NEXT: ld1 { v16.b }[4], [x11] ; CHECK-NEXT: add x8, sp, #1152 ; CHECK-NEXT: add x12, sp, #768 ; CHECK-NEXT: ld1 { v3.b }[9], [x9] -; CHECK-NEXT: ld1 { v4.b }[6], [x8] -; CHECK-NEXT: ld1 { v16.b }[6], [x12] +; CHECK-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-NEXT: ld1 { v17.b }[6], [x12] ; CHECK-NEXT: add x9, sp, #1272 ; CHECK-NEXT: add x10, sp, #888 ; CHECK-NEXT: ld1 { v7.b }[5], [x9] -; CHECK-NEXT: ld1 { v17.b }[5], [x10] +; CHECK-NEXT: ld1 { v16.b }[5], [x10] ; CHECK-NEXT: add x8, sp, #1160 ; CHECK-NEXT: add x11, sp, #776 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: ld1 { v6.b }[7], [x8] ; CHECK-NEXT: add x10, sp, #1280 -; CHECK-NEXT: ld1 { v16.b }[7], [x11] +; CHECK-NEXT: ld1 { v17.b }[7], [x11] ; CHECK-NEXT: add x11, sp, #896 ; CHECK-NEXT: add x9, sp, #160 ; CHECK-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-NEXT: ld1 { v17.b }[6], [x11] +; CHECK-NEXT: ld1 { v16.b }[6], [x11] ; CHECK-NEXT: add x8, sp, #1168 ; CHECK-NEXT: add x12, sp, #784 ; CHECK-NEXT: ld1 { v3.b }[10], [x9] -; CHECK-NEXT: ld1 { v4.b }[8], [x8] -; CHECK-NEXT: ld1 { v16.b }[8], [x12] +; CHECK-NEXT: ld1 { v6.b }[8], [x8] +; CHECK-NEXT: ld1 { v17.b }[8], [x12] ; CHECK-NEXT: add x9, sp, #1288 ; CHECK-NEXT: add x10, sp, #904 ; CHECK-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-NEXT: ld1 { v17.b }[7], [x10] +; CHECK-NEXT: ld1 { v16.b }[7], [x10] ; CHECK-NEXT: add x8, sp, #1176 ; CHECK-NEXT: add x11, sp, #792 -; CHECK-NEXT: ld1 { v4.b }[9], [x8] +; CHECK-NEXT: ld1 { v6.b }[9], [x8] ; CHECK-NEXT: add x10, sp, #1296 -; CHECK-NEXT: ld1 { v16.b }[9], [x11] +; CHECK-NEXT: ld1 { v17.b }[9], [x11] ; CHECK-NEXT: add x11, sp, #912 ; CHECK-NEXT: add x9, sp, #168 ; CHECK-NEXT: ld1 { v7.b }[8], [x10] -; CHECK-NEXT: ld1 { v17.b }[8], [x11] +; CHECK-NEXT: ld1 { v16.b }[8], [x11] ; CHECK-NEXT: add x8, sp, #1184 ; CHECK-NEXT: add x12, sp, #800 ; CHECK-NEXT: ld1 { v3.b }[11], [x9] -; CHECK-NEXT: ld1 { v4.b }[10], [x8] -; 
CHECK-NEXT: ld1 { v16.b }[10], [x12] +; CHECK-NEXT: ld1 { v6.b }[10], [x8] +; CHECK-NEXT: ld1 { v17.b }[10], [x12] ; CHECK-NEXT: add x9, sp, #1304 ; CHECK-NEXT: add x10, sp, #920 ; CHECK-NEXT: ld1 { v7.b }[9], [x9] -; CHECK-NEXT: ld1 { v17.b }[9], [x10] +; CHECK-NEXT: ld1 { v16.b }[9], [x10] ; CHECK-NEXT: add x8, sp, #1192 ; CHECK-NEXT: add x11, sp, #808 -; CHECK-NEXT: ld1 { v4.b }[11], [x8] +; CHECK-NEXT: ld1 { v6.b }[11], [x8] ; CHECK-NEXT: add x10, sp, #1312 -; CHECK-NEXT: ld1 { v16.b }[11], [x11] +; CHECK-NEXT: ld1 { v17.b }[11], [x11] ; CHECK-NEXT: add x11, sp, #928 ; CHECK-NEXT: add x9, sp, #176 ; CHECK-NEXT: ld1 { v7.b }[10], [x10] -; CHECK-NEXT: ld1 { v17.b }[10], [x11] +; CHECK-NEXT: ld1 { v16.b }[10], [x11] ; CHECK-NEXT: add x8, sp, #1200 ; CHECK-NEXT: add x12, sp, #816 ; CHECK-NEXT: ld1 { v3.b }[12], [x9] -; CHECK-NEXT: ld1 { v4.b }[12], [x8] -; CHECK-NEXT: ld1 { v16.b }[12], [x12] +; CHECK-NEXT: ld1 { v6.b }[12], [x8] +; CHECK-NEXT: ld1 { v17.b }[12], [x12] ; CHECK-NEXT: add x9, sp, #1320 ; CHECK-NEXT: add x10, sp, #936 ; CHECK-NEXT: ld1 { v7.b }[11], [x9] -; CHECK-NEXT: ld1 { v17.b }[11], [x10] +; CHECK-NEXT: ld1 { v16.b }[11], [x10] ; CHECK-NEXT: add x8, sp, #1208 ; CHECK-NEXT: add x11, sp, #824 -; CHECK-NEXT: ld1 { v4.b }[13], [x8] +; CHECK-NEXT: ld1 { v6.b }[13], [x8] ; CHECK-NEXT: add x10, sp, #1328 -; CHECK-NEXT: ld1 { v16.b }[13], [x11] +; CHECK-NEXT: ld1 { v17.b }[13], [x11] ; CHECK-NEXT: add x11, sp, #944 ; CHECK-NEXT: add x9, sp, #184 ; CHECK-NEXT: ld1 { v7.b }[12], [x10] -; CHECK-NEXT: ld1 { v17.b }[12], [x11] +; CHECK-NEXT: ld1 { v16.b }[12], [x11] ; CHECK-NEXT: add x8, sp, #1216 ; CHECK-NEXT: add x12, sp, #832 ; CHECK-NEXT: ld1 { v3.b }[13], [x9] -; CHECK-NEXT: ld1 { v4.b }[14], [x8] -; CHECK-NEXT: ld1 { v16.b }[14], [x12] +; CHECK-NEXT: ld1 { v6.b }[14], [x8] +; CHECK-NEXT: ld1 { v17.b }[14], [x12] ; CHECK-NEXT: add x9, sp, #1336 ; CHECK-NEXT: add x10, sp, #952 ; CHECK-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-NEXT: ld1 { v17.b }[13], [x10] +; CHECK-NEXT: ld1 { v16.b }[13], [x10] ; CHECK-NEXT: add x8, sp, #1224 ; CHECK-NEXT: add x11, sp, #840 -; CHECK-NEXT: ld1 { v4.b }[15], [x8] +; CHECK-NEXT: ld1 { v6.b }[15], [x8] ; CHECK-NEXT: add x8, sp, #192 -; CHECK-NEXT: ld1 { v16.b }[15], [x11] +; CHECK-NEXT: ld1 { v17.b }[15], [x11] ; CHECK-NEXT: add x10, sp, #1344 ; CHECK-NEXT: add x11, sp, #960 ; CHECK-NEXT: ld1 { v3.b }[14], [x8] ; CHECK-NEXT: ld1 { v7.b }[14], [x10] -; CHECK-NEXT: ld1 { v17.b }[14], [x11] +; CHECK-NEXT: ld1 { v16.b }[14], [x11] ; CHECK-NEXT: add x9, sp, #584 -; CHECK-NEXT: sdot v6.4s, v1.16b, v0.16b +; CHECK-NEXT: sdot v5.4s, v1.16b, v0.16b ; CHECK-NEXT: add x8, sp, #200 -; CHECK-NEXT: sdot v5.4s, v16.16b, v4.16b +; CHECK-NEXT: sdot v4.4s, v17.16b, v6.16b ; CHECK-NEXT: ld1 { v2.b }[15], [x9] ; CHECK-NEXT: add x9, sp, #1352 ; CHECK-NEXT: add x10, sp, #968 ; CHECK-NEXT: ld1 { v3.b }[15], [x8] ; CHECK-NEXT: ld1 { v7.b }[15], [x9] -; CHECK-NEXT: ld1 { v17.b }[15], [x10] -; CHECK-NEXT: sdot v6.4s, v3.16b, v2.16b -; CHECK-NEXT: sdot v5.4s, v17.16b, v7.16b -; CHECK-NEXT: add v0.4s, v6.4s, v5.4s +; CHECK-NEXT: ld1 { v16.b }[15], [x10] +; CHECK-NEXT: sdot v5.4s, v3.16b, v2.16b +; CHECK-NEXT: sdot v4.4s, v16.16b, v7.16b +; CHECK-NEXT: add v0.4s, v5.4s, v4.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2662,195 +2662,195 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldr b1, [sp, #208] +; CHECK-NEXT: ldr b5, [sp, #208] ; CHECK-NEXT: add x8, sp, #216 ; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ldr b5, [sp, #976] +; CHECK-NEXT: ldr b4, [sp, #976] ; CHECK-NEXT: add x9, sp, #984 ; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] +; CHECK-NEXT: ld1 { v5.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #224 -; CHECK-NEXT: movi v2.16b, #1 +; CHECK-NEXT: movi v1.16b, #1 ; CHECK-NEXT: mov v0.b[1], w1 -; CHECK-NEXT: ld1 { v5.b }[1], [x9] -; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v4.b }[1], [x9] +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: add x11, sp, #992 ; CHECK-NEXT: ldr b6, [sp, #720] ; CHECK-NEXT: ldr b7, [sp, #80] -; CHECK-NEXT: ld1 { v1.b }[2], [x8] +; CHECK-NEXT: ld1 { v5.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #232 ; CHECK-NEXT: add x13, sp, #88 -; CHECK-NEXT: ld1 { v5.b }[2], [x11] +; CHECK-NEXT: ld1 { v4.b }[2], [x11] ; CHECK-NEXT: ld1 { v7.b }[1], [x13] ; CHECK-NEXT: add x13, sp, #856 ; CHECK-NEXT: mov v0.b[2], w2 -; CHECK-NEXT: add x14, sp, #744 +; CHECK-NEXT: add x14, sp, #1008 ; CHECK-NEXT: add x15, sp, #872 -; CHECK-NEXT: ld1 { v1.b }[3], [x8] +; CHECK-NEXT: ld1 { v5.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #240 ; CHECK-NEXT: add x16, sp, #888 ; CHECK-NEXT: add x10, sp, #16 ; CHECK-NEXT: add x9, sp, #24 ; CHECK-NEXT: add x11, sp, #40 -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v5.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #248 ; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v1.b }[5], [x8] +; CHECK-NEXT: ld1 { v5.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #256 ; CHECK-NEXT: mov v0.b[4], w4 -; CHECK-NEXT: ld1 { v1.b }[6], [x8] +; CHECK-NEXT: ld1 { v5.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #264 ; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-NEXT: ld1 { v5.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #272 -; CHECK-NEXT: ld1 { v1.b }[8], [x8] +; CHECK-NEXT: ld1 { v5.b }[8], [x8] ; CHECK-NEXT: add x8, sp, #280 ; CHECK-NEXT: mov v0.b[6], w6 -; CHECK-NEXT: ld1 { v1.b }[9], [x8] +; CHECK-NEXT: ld1 { v5.b }[9], [x8] ; CHECK-NEXT: add x8, sp, #288 ; CHECK-NEXT: mov v0.b[7], w7 -; CHECK-NEXT: ld1 { v1.b }[10], [x8] +; CHECK-NEXT: ld1 { v5.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #296 ; CHECK-NEXT: ld1 { v0.b }[8], [x10] ; CHECK-NEXT: add x10, sp, #128 -; CHECK-NEXT: ld1 { v1.b }[11], [x8] +; CHECK-NEXT: ld1 { v5.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #304 ; CHECK-NEXT: ld1 { v0.b }[9], [x9] ; CHECK-NEXT: add x9, sp, #136 -; CHECK-NEXT: ld1 { v1.b }[12], [x8] +; CHECK-NEXT: ld1 { v5.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v1.b }[13], [x8] +; CHECK-NEXT: ld1 { v5.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #320 -; CHECK-NEXT: ld1 { v1.b }[14], [x8] +; CHECK-NEXT: ld1 { v5.b }[14], [x8] ; CHECK-NEXT: add x8, sp, #32 ; CHECK-NEXT: ld1 { v0.b }[10], [x8] ; CHECK-NEXT: add x8, sp, #144 -; CHECK-NEXT: ld1 { v1.b }[15], [x12] +; CHECK-NEXT: ld1 { v5.b }[15], [x12] ; CHECK-NEXT: add x12, sp, #728 ; CHECK-NEXT: ld1 { v6.b }[1], [x12] ; CHECK-NEXT: add x12, sp, #1000 ; CHECK-NEXT: ld1 { v0.b }[11], [x11] -; CHECK-NEXT: ld1 { v5.b }[3], [x12] +; CHECK-NEXT: ld1 { v4.b }[3], [x12] ; CHECK-NEXT: add x12, sp, #736 ; CHECK-NEXT: add x11, sp, #920 -; CHECK-NEXT: sdot v4.4s, v1.16b, v2.16b -; CHECK-NEXT: ldr b1, [sp, #848] +; CHECK-NEXT: sdot v3.4s, v5.16b, v1.16b +; CHECK-NEXT: ldr b5, 
[sp, #848] ; CHECK-NEXT: ld1 { v6.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #1008 -; CHECK-NEXT: ld1 { v1.b }[1], [x13] -; CHECK-NEXT: ld1 { v5.b }[4], [x12] -; CHECK-NEXT: add x12, sp, #96 -; CHECK-NEXT: ld1 { v7.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #1016 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: ld1 { v6.b }[3], [x14] -; CHECK-NEXT: add x14, sp, #864 -; CHECK-NEXT: ld1 { v0.b }[12], [x13] -; CHECK-NEXT: ld1 { v1.b }[2], [x14] -; CHECK-NEXT: add x14, sp, #752 -; CHECK-NEXT: ld1 { v5.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #104 -; CHECK-NEXT: ld1 { v6.b }[4], [x14] -; CHECK-NEXT: add x14, sp, #1024 -; CHECK-NEXT: ld1 { v7.b }[3], [x12] -; CHECK-NEXT: ld1 { v1.b }[3], [x15] +; CHECK-NEXT: add x12, sp, #48 +; CHECK-NEXT: ld1 { v5.b }[1], [x13] +; CHECK-NEXT: add x13, sp, #744 +; CHECK-NEXT: ld1 { v4.b }[4], [x14] +; CHECK-NEXT: add x14, sp, #96 +; CHECK-NEXT: ld1 { v0.b }[12], [x12] +; CHECK-NEXT: ld1 { v6.b }[3], [x13] +; CHECK-NEXT: add x13, sp, #864 +; CHECK-NEXT: ld1 { v7.b }[2], [x14] +; CHECK-NEXT: add x14, sp, #1016 +; CHECK-NEXT: ld1 { v5.b }[2], [x13] +; CHECK-NEXT: add x13, sp, #752 +; CHECK-NEXT: ld1 { v4.b }[5], [x14] +; CHECK-NEXT: add x14, sp, #104 +; CHECK-NEXT: ld1 { v6.b }[4], [x13] +; CHECK-NEXT: add x13, sp, #1024 +; CHECK-NEXT: ld1 { v7.b }[3], [x14] +; CHECK-NEXT: ld1 { v5.b }[3], [x15] ; CHECK-NEXT: add x15, sp, #760 -; CHECK-NEXT: ld1 { v5.b }[6], [x14] -; CHECK-NEXT: add x12, sp, #112 -; CHECK-NEXT: add x14, sp, #880 +; CHECK-NEXT: add x14, sp, #112 +; CHECK-NEXT: ld1 { v4.b }[6], [x13] +; CHECK-NEXT: add x13, sp, #880 ; CHECK-NEXT: ld1 { v6.b }[5], [x15] ; CHECK-NEXT: add x15, sp, #1032 -; CHECK-NEXT: ld1 { v7.b }[4], [x12] -; CHECK-NEXT: ld1 { v1.b }[4], [x14] +; CHECK-NEXT: ld1 { v7.b }[4], [x14] +; CHECK-NEXT: ld1 { v5.b }[4], [x13] ; CHECK-NEXT: add x14, sp, #768 -; CHECK-NEXT: ld1 { v5.b }[7], [x15] -; CHECK-NEXT: add x12, sp, #120 +; CHECK-NEXT: add x13, sp, #120 +; CHECK-NEXT: ld1 { v4.b }[7], [x15] ; CHECK-NEXT: add x15, sp, #1040 ; CHECK-NEXT: ld1 { v6.b }[6], [x14] -; CHECK-NEXT: ld1 { v7.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #776 -; CHECK-NEXT: ld1 { v1.b }[5], [x16] -; CHECK-NEXT: ld1 { v5.b }[8], [x15] -; CHECK-NEXT: add x15, sp, #896 +; CHECK-NEXT: ld1 { v7.b }[5], [x13] +; CHECK-NEXT: add x13, sp, #776 +; CHECK-NEXT: ld1 { v5.b }[5], [x16] ; CHECK-NEXT: add x14, sp, #1048 -; CHECK-NEXT: ld1 { v6.b }[7], [x12] +; CHECK-NEXT: ld1 { v4.b }[8], [x15] +; CHECK-NEXT: add x15, sp, #896 +; CHECK-NEXT: ld1 { v6.b }[7], [x13] ; CHECK-NEXT: ld1 { v7.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #784 -; CHECK-NEXT: ld1 { v1.b }[6], [x15] -; CHECK-NEXT: ld1 { v5.b }[9], [x14] +; CHECK-NEXT: ld1 { v5.b }[6], [x15] +; CHECK-NEXT: add x13, sp, #1056 +; CHECK-NEXT: ld1 { v4.b }[9], [x14] ; CHECK-NEXT: add x14, sp, #904 -; CHECK-NEXT: add x12, sp, #1056 ; CHECK-NEXT: ld1 { v6.b }[8], [x10] ; CHECK-NEXT: ld1 { v7.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #792 -; CHECK-NEXT: ld1 { v1.b }[7], [x14] -; CHECK-NEXT: ld1 { v5.b }[10], [x12] -; CHECK-NEXT: add x12, sp, #912 +; CHECK-NEXT: ld1 { v5.b }[7], [x14] ; CHECK-NEXT: add x10, sp, #1064 +; CHECK-NEXT: ld1 { v4.b }[10], [x13] +; CHECK-NEXT: add x13, sp, #912 ; CHECK-NEXT: ld1 { v6.b }[9], [x9] ; CHECK-NEXT: ld1 { v7.b }[8], [x8] ; CHECK-NEXT: add x9, sp, #800 -; CHECK-NEXT: ld1 { v1.b }[8], [x12] -; CHECK-NEXT: ld1 { v5.b }[11], [x10] +; CHECK-NEXT: ld1 { v5.b }[8], [x13] ; CHECK-NEXT: add x8, sp, #152 +; CHECK-NEXT: ld1 { v4.b }[11], [x10] ; CHECK-NEXT: add x10, sp, #1072 ; CHECK-NEXT: ld1 { v6.b }[10], [x9] ; 
CHECK-NEXT: ld1 { v7.b }[9], [x8] ; CHECK-NEXT: add x9, sp, #808 -; CHECK-NEXT: ld1 { v1.b }[9], [x11] -; CHECK-NEXT: ld1 { v5.b }[12], [x10] -; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ld1 { v5.b }[9], [x11] ; CHECK-NEXT: add x8, sp, #56 +; CHECK-NEXT: ld1 { v4.b }[12], [x10] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ld1 { v0.b }[13], [x8] ; CHECK-NEXT: ld1 { v6.b }[11], [x9] ; CHECK-NEXT: add x9, sp, #928 ; CHECK-NEXT: ld1 { v7.b }[10], [x10] ; CHECK-NEXT: add x10, sp, #1080 -; CHECK-NEXT: ld1 { v1.b }[10], [x9] -; CHECK-NEXT: ld1 { v0.b }[13], [x8] -; CHECK-NEXT: ld1 { v5.b }[13], [x10] +; CHECK-NEXT: ld1 { v5.b }[10], [x9] ; CHECK-NEXT: add x8, sp, #816 +; CHECK-NEXT: ld1 { v4.b }[13], [x10] ; CHECK-NEXT: add x9, sp, #168 +; CHECK-NEXT: add x10, sp, #176 ; CHECK-NEXT: ld1 { v6.b }[12], [x8] ; CHECK-NEXT: add x8, sp, #936 ; CHECK-NEXT: ld1 { v7.b }[11], [x9] ; CHECK-NEXT: add x9, sp, #1088 -; CHECK-NEXT: ld1 { v1.b }[11], [x8] -; CHECK-NEXT: add x10, sp, #176 -; CHECK-NEXT: ld1 { v5.b }[14], [x9] -; CHECK-NEXT: add x9, sp, #824 +; CHECK-NEXT: ld1 { v5.b }[11], [x8] ; CHECK-NEXT: add x8, sp, #64 +; CHECK-NEXT: ld1 { v4.b }[14], [x9] +; CHECK-NEXT: add x9, sp, #824 +; CHECK-NEXT: ld1 { v0.b }[14], [x8] ; CHECK-NEXT: ld1 { v6.b }[13], [x9] ; CHECK-NEXT: add x9, sp, #944 ; CHECK-NEXT: ld1 { v7.b }[12], [x10] ; CHECK-NEXT: add x10, sp, #1096 -; CHECK-NEXT: ld1 { v1.b }[12], [x9] -; CHECK-NEXT: ld1 { v0.b }[14], [x8] -; CHECK-NEXT: ld1 { v5.b }[15], [x10] +; CHECK-NEXT: ld1 { v5.b }[12], [x9] ; CHECK-NEXT: add x8, sp, #832 +; CHECK-NEXT: ld1 { v4.b }[15], [x10] ; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x10, sp, #72 ; CHECK-NEXT: ld1 { v6.b }[14], [x8] ; CHECK-NEXT: add x8, sp, #952 ; CHECK-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-NEXT: ld1 { v1.b }[13], [x8] -; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: ld1 { v5.b }[13], [x8] ; CHECK-NEXT: add x8, sp, #840 -; CHECK-NEXT: sdot v3.4s, v5.16b, v2.16b ; CHECK-NEXT: ld1 { v0.b }[15], [x10] +; CHECK-NEXT: sdot v2.4s, v4.16b, v1.16b ; CHECK-NEXT: add x9, sp, #192 ; CHECK-NEXT: ld1 { v6.b }[15], [x8] ; CHECK-NEXT: add x8, sp, #960 ; CHECK-NEXT: ld1 { v7.b }[14], [x9] -; CHECK-NEXT: ld1 { v1.b }[14], [x8] +; CHECK-NEXT: ld1 { v5.b }[14], [x8] +; CHECK-NEXT: sdot v3.4s, v0.16b, v1.16b ; CHECK-NEXT: add x8, sp, #200 ; CHECK-NEXT: add x9, sp, #968 -; CHECK-NEXT: sdot v4.4s, v0.16b, v2.16b -; CHECK-NEXT: sdot v3.4s, v6.16b, v2.16b +; CHECK-NEXT: sdot v2.4s, v6.16b, v1.16b ; CHECK-NEXT: ld1 { v7.b }[15], [x8] -; CHECK-NEXT: ld1 { v1.b }[15], [x9] -; CHECK-NEXT: sdot v4.4s, v7.16b, v2.16b -; CHECK-NEXT: sdot v3.4s, v1.16b, v2.16b -; CHECK-NEXT: add v0.4s, v4.4s, v3.4s +; CHECK-NEXT: ld1 { v5.b }[15], [x9] +; CHECK-NEXT: sdot v3.4s, v7.16b, v1.16b +; CHECK-NEXT: sdot v2.4s, v5.16b, v1.16b +; CHECK-NEXT: add v0.4s, v3.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 6d2305059ce88..913205f327536 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -459,48 +459,48 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-NEXT: mov v0.b[1], w1 ; CHECK-NEXT: ld1 { v3.b }[1], [x11] ; CHECK-NEXT: ld1 { v1.b }[1], [x9] -; CHECK-NEXT: add x12, sp, #16 +; CHECK-NEXT: add x11, sp, #16 ; CHECK-NEXT: add x9, sp, #112 ; CHECK-NEXT: add x13, sp, #184 ; CHECK-NEXT: ld1 { v2.b }[2], [x10] -; CHECK-NEXT: add x11, sp, #120 +; 
CHECK-NEXT: add x12, sp, #120 ; CHECK-NEXT: add x14, sp, #32 -; CHECK-NEXT: ld1 { v3.b }[2], [x12] +; CHECK-NEXT: ld1 { v3.b }[2], [x11] ; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: ldr b5, [sp, #64] ; CHECK-NEXT: mov v0.b[2], w2 ; CHECK-NEXT: ldr b4, [sp, #224] -; CHECK-NEXT: add x10, sp, #128 +; CHECK-NEXT: add x11, sp, #128 ; CHECK-NEXT: ld1 { v2.b }[3], [x13] ; CHECK-NEXT: add x13, sp, #24 -; CHECK-NEXT: add x12, sp, #136 +; CHECK-NEXT: add x10, sp, #136 ; CHECK-NEXT: ld1 { v3.b }[3], [x13] -; CHECK-NEXT: ld1 { v1.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #192 +; CHECK-NEXT: ld1 { v1.b }[3], [x12] +; CHECK-NEXT: add x12, sp, #192 ; CHECK-NEXT: add x13, sp, #200 ; CHECK-NEXT: add x15, sp, #80 ; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v2.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #232 +; CHECK-NEXT: ld1 { v2.b }[4], [x12] +; CHECK-NEXT: add x12, sp, #232 ; CHECK-NEXT: ld1 { v3.b }[4], [x14] ; CHECK-NEXT: add x14, sp, #72 -; CHECK-NEXT: ld1 { v4.b }[1], [x11] +; CHECK-NEXT: ld1 { v4.b }[1], [x12] ; CHECK-NEXT: ld1 { v5.b }[1], [x14] ; CHECK-NEXT: add x14, sp, #40 -; CHECK-NEXT: ld1 { v1.b }[4], [x10] +; CHECK-NEXT: ld1 { v1.b }[4], [x11] ; CHECK-NEXT: ld1 { v2.b }[5], [x13] -; CHECK-NEXT: add x11, sp, #208 +; CHECK-NEXT: add x12, sp, #208 ; CHECK-NEXT: add x13, sp, #48 ; CHECK-NEXT: mov v0.b[4], w4 ; CHECK-NEXT: ld1 { v3.b }[5], [x14] ; CHECK-NEXT: add x14, sp, #240 ; CHECK-NEXT: ld1 { v4.b }[2], [x14] ; CHECK-NEXT: ld1 { v5.b }[2], [x15] -; CHECK-NEXT: ld1 { v1.b }[5], [x12] -; CHECK-NEXT: ld1 { v2.b }[6], [x11] -; CHECK-NEXT: add x10, sp, #216 -; CHECK-NEXT: add x11, sp, #56 +; CHECK-NEXT: ld1 { v1.b }[5], [x10] +; CHECK-NEXT: ld1 { v2.b }[6], [x12] +; CHECK-NEXT: add x11, sp, #216 +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: ld1 { v3.b }[6], [x13] ; CHECK-NEXT: add x12, sp, #248 ; CHECK-NEXT: add x13, sp, #88 @@ -508,9 +508,9 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-NEXT: ld1 { v4.b }[3], [x12] ; CHECK-NEXT: ld1 { v5.b }[3], [x13] ; CHECK-NEXT: ld1 { v1.b }[6], [x9] -; CHECK-NEXT: ld1 { v2.b }[7], [x10] +; CHECK-NEXT: ld1 { v2.b }[7], [x11] ; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v3.b }[7], [x11] +; CHECK-NEXT: ld1 { v3.b }[7], [x10] ; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b ; CHECK-NEXT: mov v0.b[6], w6 ; CHECK-NEXT: ld1 { v1.b }[7], [x9] @@ -545,23 +545,23 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) { ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w23, -48 -; CHECK-NEXT: ldr w12, [sp, #112] +; CHECK-NEXT: ldr w13, [sp, #112] ; CHECK-NEXT: ldr w14, [sp, #144] ; CHECK-NEXT: fmov s2, w4 -; CHECK-NEXT: ldr w16, [sp, #176] +; CHECK-NEXT: ldr w17, [sp, #176] ; CHECK-NEXT: ldr w19, [sp, #208] ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: ldr w20, [sp, #80] ; CHECK-NEXT: ldr w21, [sp, #48] -; CHECK-NEXT: fmov s5, w12 +; CHECK-NEXT: fmov s5, w13 ; CHECK-NEXT: fmov s4, w19 -; CHECK-NEXT: fmov s6, w16 +; CHECK-NEXT: fmov s6, w17 ; CHECK-NEXT: fmov s7, w14 ; CHECK-NEXT: fmov s0, w20 ; CHECK-NEXT: fmov s1, w21 ; CHECK-NEXT: ldr w10, [sp, #120] ; CHECK-NEXT: ldr w11, [sp, #152] -; CHECK-NEXT: ldr w13, [sp, #184] +; CHECK-NEXT: ldr w12, [sp, #184] ; CHECK-NEXT: ldr w15, [sp, #216] ; CHECK-NEXT: ldr w22, [sp, #88] ; CHECK-NEXT: ldr w23, [sp, #56] @@ -571,18 +571,18 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) { ; CHECK-NEXT: mov v4.h[1], w15 ; CHECK-NEXT: mov v0.h[1], w22 ; CHECK-NEXT: mov v1.h[1], w23 -; CHECK-NEXT: mov v6.h[1], w13 +; CHECK-NEXT: mov 
v6.h[1], w12 ; CHECK-NEXT: mov v7.h[1], w11 ; CHECK-NEXT: ldr w8, [sp, #128] ; CHECK-NEXT: ldr w9, [sp, #160] -; CHECK-NEXT: ldr w17, [sp, #64] +; CHECK-NEXT: ldr w16, [sp, #64] ; CHECK-NEXT: ldr w18, [sp, #96] ; CHECK-NEXT: ldr w10, [sp, #192] ; CHECK-NEXT: ldr w11, [sp, #224] ; CHECK-NEXT: mov v2.h[2], w6 ; CHECK-NEXT: mov v3.h[2], w2 ; CHECK-NEXT: mov v0.h[2], w18 -; CHECK-NEXT: mov v1.h[2], w17 +; CHECK-NEXT: mov v1.h[2], w16 ; CHECK-NEXT: mov v5.h[2], w8 ; CHECK-NEXT: mov v4.h[2], w11 ; CHECK-NEXT: mov v6.h[2], w10 diff --git a/llvm/test/CodeGen/AArch64/pow.ll b/llvm/test/CodeGen/AArch64/pow.ll index 623429c1085ac..ba9a5398298af 100644 --- a/llvm/test/CodeGen/AArch64/pow.ll +++ b/llvm/test/CodeGen/AArch64/pow.ll @@ -110,17 +110,17 @@ define <2 x double> @pow_v2f64_one_fourth_not_enough_fmf(<2 x double> %x) nounwi ; CHECK-LABEL: pow_v2f64_one_fourth_not_enough_fmf: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov d0, v0.d[1] ; CHECK-NEXT: fmov d1, #0.25000000 ; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: bl pow ; CHECK-NEXT: fmov d1, #0.25000000 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl pow -; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], v1.d[0] diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index 43f40badc1ae2..419f25c22eb72 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -29,22 +29,24 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: .cfi_offset b14, -88 ; CHECK-NEXT: .cfi_offset b15, -96 ; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: // implicit-def: $q6 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: adrp x10, B+48 ; CHECK-NEXT: add x10, x10, :lo12:B+48 ; CHECK-NEXT: adrp x11, A ; CHECK-NEXT: add x11, x11, :lo12:A +; CHECK-NEXT: // kill: killed $q6 ; CHECK-NEXT: // implicit-def: $q6 -; CHECK-NEXT: // implicit-def: $q7 -; CHECK-NEXT: // implicit-def: $q10 +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: // implicit-def: $q2 ; CHECK-NEXT: // implicit-def: $q3 ; CHECK-NEXT: // implicit-def: $q4 ; CHECK-NEXT: // implicit-def: $q5 -; CHECK-NEXT: // implicit-def: $q2 +; CHECK-NEXT: // implicit-def: $q7 ; CHECK-NEXT: // implicit-def: $q16 ; CHECK-NEXT: // implicit-def: $q17 -; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: // implicit-def: $q10 ; CHECK-NEXT: // implicit-def: $q19 ; CHECK-NEXT: // implicit-def: $q20 ; CHECK-NEXT: // implicit-def: $q21 @@ -56,179 +58,170 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-NEXT: // implicit-def: $q26 ; CHECK-NEXT: // implicit-def: $q28 ; CHECK-NEXT: // implicit-def: $q30 -; CHECK-NEXT: // implicit-def: $q15 +; CHECK-NEXT: // implicit-def: $q18 ; CHECK-NEXT: // implicit-def: $q29 ; CHECK-NEXT: // implicit-def: $q31 -; CHECK-NEXT: // implicit-def: $q11 -; CHECK-NEXT: // implicit-def: $q9 -; CHECK-NEXT: 
// kill: killed $q6 ; CHECK-NEXT: // implicit-def: $q12 ; CHECK-NEXT: // implicit-def: $q13 +; CHECK-NEXT: // implicit-def: $q11 +; CHECK-NEXT: // kill: killed $q6 ; CHECK-NEXT: // implicit-def: $q6 ; CHECK-NEXT: // kill: killed $q6 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr q14, [x8] ; CHECK-NEXT: mov x12, xzr +; CHECK-NEXT: str q18, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: ldr x14, [x12] +; CHECK-NEXT: ldr q15, [x12] ; CHECK-NEXT: add x7, x11, x8 -; CHECK-NEXT: ldr x13, [x12] +; CHECK-NEXT: fmov x15, d14 +; CHECK-NEXT: mov x16, v14.d[1] +; CHECK-NEXT: ldr q18, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: fmov x18, d15 +; CHECK-NEXT: mov x13, v15.d[1] ; CHECK-NEXT: ldr x5, [x8] -; CHECK-NEXT: ldr x7, [x7, #128] -; CHECK-NEXT: mov x14, v14.d[1] -; CHECK-NEXT: stp q22, q26, [sp] // 32-byte Folded Spill -; CHECK-NEXT: mov v22.16b, v9.16b -; CHECK-NEXT: stp q31, q15, [sp, #32] // 32-byte Folded Spill -; CHECK-NEXT: ldr q15, [x12] -; CHECK-NEXT: fmov x12, d14 ; CHECK-NEXT: ldr q14, [x10], #64 -; CHECK-NEXT: mov v9.16b, v30.16b -; CHECK-NEXT: fmov x17, d15 -; CHECK-NEXT: mov x16, v15.d[1] -; CHECK-NEXT: mov v30.16b, v27.16b -; CHECK-NEXT: mul x15, x12, x13 -; CHECK-NEXT: mov x0, v14.d[1] +; CHECK-NEXT: ldr x7, [x7, #128] +; CHECK-NEXT: mul x17, x15, x14 +; CHECK-NEXT: mov v6.16b, v0.16b +; CHECK-NEXT: mov v9.16b, v27.16b +; CHECK-NEXT: mov x12, v14.d[1] ; CHECK-NEXT: fmov x4, d14 ; CHECK-NEXT: mov v27.16b, v23.16b +; CHECK-NEXT: mul x1, x16, x14 ; CHECK-NEXT: mov v23.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v2.16b -; CHECK-NEXT: mul x1, x14, x13 -; CHECK-NEXT: mov v8.16b, v28.16b -; CHECK-NEXT: mov v28.16b, v24.16b -; CHECK-NEXT: mov v24.16b, v20.16b +; CHECK-NEXT: mov v19.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v2.16b +; CHECK-NEXT: stp q26, q31, [sp] // 32-byte Folded Spill +; CHECK-NEXT: mov v31.16b, v22.16b +; CHECK-NEXT: mul x0, x18, x14 +; CHECK-NEXT: mov v26.16b, v10.16b +; CHECK-NEXT: mov v22.16b, v5.16b +; CHECK-NEXT: fmov d15, x17 +; CHECK-NEXT: mov v5.16b, v1.16b +; CHECK-NEXT: mov v8.16b, v20.16b +; CHECK-NEXT: mul x2, x13, x14 ; CHECK-NEXT: mov v20.16b, v16.16b ; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: mul x18, x17, x13 -; CHECK-NEXT: mov v31.16b, v18.16b -; CHECK-NEXT: mov v26.16b, v5.16b -; CHECK-NEXT: fmov d15, x15 -; CHECK-NEXT: mov v5.16b, v1.16b -; CHECK-NEXT: mov v18.16b, v10.16b -; CHECK-NEXT: mul x2, x16, x13 -; CHECK-NEXT: mov v10.16b, v29.16b -; CHECK-NEXT: mov v29.16b, v25.16b -; CHECK-NEXT: mov v25.16b, v21.16b +; CHECK-NEXT: mov v10.16b, v21.16b ; CHECK-NEXT: mov v21.16b, v17.16b ; CHECK-NEXT: mov v17.16b, v4.16b ; CHECK-NEXT: mov v15.d[1], x1 -; CHECK-NEXT: mul x19, x12, x5 +; CHECK-NEXT: mul x3, x12, x14 ; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: fmov d14, x18 +; CHECK-NEXT: fmov d14, x0 ; CHECK-NEXT: cmp x8, #64 ; CHECK-NEXT: add x9, x9, #1 -; CHECK-NEXT: mul x12, x12, x7 +; CHECK-NEXT: mul x14, x4, x14 +; CHECK-NEXT: add v18.2d, v18.2d, v15.2d +; CHECK-NEXT: mul x19, x15, x5 ; CHECK-NEXT: mov v14.d[1], x2 -; CHECK-NEXT: add v12.2d, v12.2d, v15.2d -; CHECK-NEXT: mul x3, x0, x13 +; CHECK-NEXT: mul x15, x15, x7 +; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: str q18, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: ldp q18, q15, [sp, #32] // 32-byte Folded Reload +; CHECK-NEXT: mul x6, x16, x5 ; CHECK-NEXT: fmov d1, x19 -; CHECK-NEXT: mul x13, x4, x13 -; CHECK-NEXT: fmov d2, x12 -; CHECK-NEXT: mul x6, x14, x5 -; CHECK-NEXT: add v6.2d, v13.2d, v14.2d -; CHECK-NEXT: mov v13.16b, 
v12.16b -; CHECK-NEXT: ldr q12, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: mul x14, x14, x7 -; CHECK-NEXT: fmov d0, x13 -; CHECK-NEXT: add v12.2d, v12.2d, v14.2d -; CHECK-NEXT: mul x21, x17, x7 -; CHECK-NEXT: mov v1.d[1], x6 -; CHECK-NEXT: mul x18, x4, x7 ; CHECK-NEXT: mov v0.d[1], x3 -; CHECK-NEXT: mov v2.d[1], x14 -; CHECK-NEXT: str q12, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: mov v12.16b, v13.16b -; CHECK-NEXT: mul x13, x17, x5 -; CHECK-NEXT: mov v13.16b, v6.16b +; CHECK-NEXT: mul x16, x16, x7 +; CHECK-NEXT: fmov d2, x15 +; CHECK-NEXT: add v15.2d, v15.2d, v14.2d +; CHECK-NEXT: mul x21, x18, x7 +; CHECK-NEXT: mov v1.d[1], x6 +; CHECK-NEXT: mul x0, x4, x7 +; CHECK-NEXT: str q15, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: add v15.2d, v11.2d, v14.2d +; CHECK-NEXT: mov v2.d[1], x16 +; CHECK-NEXT: ldr q11, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: mul x20, x13, x7 ; CHECK-NEXT: fmov d3, x21 -; CHECK-NEXT: ldp q15, q6, [sp, #48] // 32-byte Folded Reload -; CHECK-NEXT: mul x20, x16, x7 -; CHECK-NEXT: add v11.2d, v11.2d, v1.2d -; CHECK-NEXT: fmov d4, x18 -; CHECK-NEXT: mul x22, x0, x7 -; CHECK-NEXT: add v6.2d, v6.2d, v0.2d -; CHECK-NEXT: add v15.2d, v15.2d, v2.2d -; CHECK-NEXT: fmov d14, x13 -; CHECK-NEXT: mov v2.16b, v19.16b +; CHECK-NEXT: add v11.2d, v11.2d, v0.2d +; CHECK-NEXT: add v12.2d, v12.2d, v1.2d +; CHECK-NEXT: mul x22, x12, x7 +; CHECK-NEXT: fmov d4, x0 +; CHECK-NEXT: add v18.2d, v18.2d, v2.2d +; CHECK-NEXT: mov v2.16b, v7.16b +; CHECK-NEXT: mul x14, x18, x5 +; CHECK-NEXT: mov v7.16b, v19.16b ; CHECK-NEXT: mov v19.16b, v23.16b -; CHECK-NEXT: mul x14, x4, x5 -; CHECK-NEXT: mov v23.16b, v27.16b -; CHECK-NEXT: mov v27.16b, v30.16b ; CHECK-NEXT: mov v3.d[1], x20 -; CHECK-NEXT: mov v30.16b, v9.16b -; CHECK-NEXT: mov v9.16b, v22.16b -; CHECK-NEXT: mul x12, x16, x5 -; CHECK-NEXT: str q6, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: mov v6.16b, v18.16b +; CHECK-NEXT: mov v23.16b, v27.16b +; CHECK-NEXT: mov v27.16b, v9.16b +; CHECK-NEXT: mul x15, x4, x5 +; CHECK-NEXT: add v27.2d, v9.2d, v1.2d +; CHECK-NEXT: str q11, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov v4.d[1], x22 -; CHECK-NEXT: add v27.2d, v27.2d, v1.2d -; CHECK-NEXT: add v23.2d, v23.2d, v1.2d -; CHECK-NEXT: mul x13, x0, x5 ; CHECK-NEXT: add v19.2d, v19.2d, v1.2d -; CHECK-NEXT: add v2.2d, v2.2d, v1.2d -; CHECK-NEXT: fmov d0, x14 +; CHECK-NEXT: add v7.2d, v7.2d, v1.2d +; CHECK-NEXT: mul x13, x13, x5 +; CHECK-NEXT: add v23.2d, v23.2d, v1.2d +; CHECK-NEXT: add v1.2d, v5.2d, v1.2d +; CHECK-NEXT: fmov d14, x14 ; CHECK-NEXT: add v30.2d, v30.2d, v3.2d ; CHECK-NEXT: mov v3.16b, v16.16b +; CHECK-NEXT: mul x12, x12, x5 ; CHECK-NEXT: mov v16.16b, v20.16b -; CHECK-NEXT: mov v20.16b, v24.16b -; CHECK-NEXT: mov v24.16b, v28.16b -; CHECK-NEXT: mov v14.d[1], x12 -; CHECK-NEXT: mov v28.16b, v8.16b -; CHECK-NEXT: add v1.2d, v5.2d, v1.2d -; CHECK-NEXT: add v28.2d, v8.2d, v4.2d +; CHECK-NEXT: mov v5.16b, v22.16b +; CHECK-NEXT: fmov d0, x15 +; CHECK-NEXT: add v28.2d, v28.2d, v4.2d ; CHECK-NEXT: mov v4.16b, v17.16b ; CHECK-NEXT: mov v17.16b, v21.16b -; CHECK-NEXT: mov v0.d[1], x13 -; CHECK-NEXT: mov v21.16b, v25.16b -; CHECK-NEXT: mov v25.16b, v29.16b -; CHECK-NEXT: mov v29.16b, v10.16b -; CHECK-NEXT: mov v5.16b, v26.16b -; CHECK-NEXT: mov v18.16b, v31.16b -; CHECK-NEXT: ldp q22, q26, [sp] // 32-byte Folded Reload -; CHECK-NEXT: ldr q31, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: add v9.2d, v9.2d, v14.2d -; CHECK-NEXT: add v24.2d, v24.2d, v14.2d -; CHECK-NEXT: add v20.2d, v20.2d, v14.2d +; CHECK-NEXT: mov 
v21.16b, v10.16b +; CHECK-NEXT: mov v10.16b, v26.16b +; CHECK-NEXT: mov v14.d[1], x13 +; CHECK-NEXT: mov v22.16b, v31.16b +; CHECK-NEXT: mov v20.16b, v8.16b +; CHECK-NEXT: ldp q26, q31, [sp] // 32-byte Folded Reload +; CHECK-NEXT: mov v11.16b, v15.16b +; CHECK-NEXT: mov v0.d[1], x12 +; CHECK-NEXT: add v13.2d, v13.2d, v14.2d ; CHECK-NEXT: add v31.2d, v31.2d, v14.2d -; CHECK-NEXT: add v18.2d, v18.2d, v14.2d -; CHECK-NEXT: add v16.2d, v16.2d, v14.2d ; CHECK-NEXT: add v26.2d, v26.2d, v14.2d +; CHECK-NEXT: add v24.2d, v24.2d, v14.2d ; CHECK-NEXT: add v22.2d, v22.2d, v14.2d +; CHECK-NEXT: add v20.2d, v8.2d, v14.2d +; CHECK-NEXT: add v10.2d, v10.2d, v14.2d +; CHECK-NEXT: add v16.2d, v16.2d, v14.2d ; CHECK-NEXT: add v5.2d, v5.2d, v14.2d ; CHECK-NEXT: add v3.2d, v3.2d, v14.2d -; CHECK-NEXT: add v10.2d, v6.2d, v14.2d +; CHECK-NEXT: add v2.2d, v2.2d, v14.2d ; CHECK-NEXT: add v29.2d, v29.2d, v0.2d ; CHECK-NEXT: add v25.2d, v25.2d, v0.2d ; CHECK-NEXT: add v21.2d, v21.2d, v0.2d ; CHECK-NEXT: add v17.2d, v17.2d, v0.2d ; CHECK-NEXT: add v4.2d, v4.2d, v0.2d -; CHECK-NEXT: add v7.2d, v7.2d, v0.2d +; CHECK-NEXT: add v0.2d, v6.2d, v0.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup +; CHECK-NEXT: ldr q6, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: stp q13, q12, [x8] -; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: stp q9, q11, [x8, #64] +; CHECK-NEXT: stp q12, q31, [x8, #80] ; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload -; CHECK-NEXT: stp q15, q30, [x8, #144] +; CHECK-NEXT: str q6, [x8] +; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: str q29, [x8, #112] ; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload -; CHECK-NEXT: stp q4, q3, [x8, #432] +; CHECK-NEXT: stp q6, q11, [x8, #16] +; CHECK-NEXT: ldr q6, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: stp q18, q30, [x8, #144] ; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload -; CHECK-NEXT: stp q0, q6, [x8, #32] +; CHECK-NEXT: stp q6, q13, [x8, #48] ; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: stp q31, q29, [x8, #96] -; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload ; CHECK-NEXT: stp q28, q26, [x8, #176] +; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: stp q19, q10, [x8, #336] +; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload ; CHECK-NEXT: str q27, [x8, #208] ; CHECK-NEXT: stp q25, q24, [x8, #240] ; CHECK-NEXT: stp q23, q22, [x8, #272] ; CHECK-NEXT: stp q21, q20, [x8, #304] -; CHECK-NEXT: stp q19, q18, [x8, #336] ; CHECK-NEXT: stp q17, q16, [x8, #368] -; CHECK-NEXT: stp q2, q5, [x8, #400] -; CHECK-NEXT: stp q1, q10, [x8, #464] -; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload -; CHECK-NEXT: str q7, [x8, #496] +; CHECK-NEXT: stp q7, q5, [x8, #400] +; CHECK-NEXT: stp q4, q3, [x8, #432] +; CHECK-NEXT: stp q1, q2, [x8, #464] +; CHECK-NEXT: str q0, [x8, #496] ; CHECK-NEXT: add sp, sp, #192 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w19 diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll new file mode 100644 index 0000000000000..47906252382f4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sme -pass-remarks-missed='regalloc' %s -o - 2>&1 
| FileCheck %s
+
+; We should have both spill and reload for %arg.
+
+; CHECK: remark: <unknown>:0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost generated in function
+
+define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind #0 {
+  %res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
+  %and = and <vscale x 2 x i1> %res, %arg
+  ret <vscale x 2 x i1> %and
+}
+
+declare <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1>)
+
+attributes #0 = { nounwind "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index 5f7a22ed055c8..70987df1c9c04 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -333,21 +333,21 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT: punpklo p2.h, p0.b
-; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: punpklo p2.h, p1.b
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d]
-; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0
+; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b
+; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z4.d]
; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d]
; VBITS_GE_256-NEXT: ret
;
@@ -711,21 +711,21 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
-; VBITS_GE_256-NEXT: ptrue p1.d, vl4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: punpklo p2.h, p1.b
; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT: punpklo p2.h, p0.b
-; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
-; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p1.b
; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z4.d]
-; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0
+; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b
+; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT: st1w { z2.d }, p1, 
[z4.d] ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index 6e29f7cbabcc8..a5303c901b80f 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -33,75 +33,75 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: umov w8, v0.b[8] ; CHECK-NEXT: umov w9, v0.b[9] ; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: umov w11, v0.b[15] -; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s2, w8 ; CHECK-NEXT: umov w8, v0.b[10] -; CHECK-NEXT: mov v2.b[1], w10 +; CHECK-NEXT: mov v1.b[1], w10 ; CHECK-NEXT: umov w10, v0.b[11] -; CHECK-NEXT: mov v1.b[1], w9 +; CHECK-NEXT: mov v2.b[1], w9 ; CHECK-NEXT: umov w9, v0.b[2] -; CHECK-NEXT: mov v1.b[2], w8 +; CHECK-NEXT: mov v2.b[2], w8 ; CHECK-NEXT: umov w8, v0.b[3] -; CHECK-NEXT: mov v2.b[2], w9 +; CHECK-NEXT: mov v1.b[2], w9 ; CHECK-NEXT: umov w9, v0.b[12] -; CHECK-NEXT: mov v1.b[3], w10 +; CHECK-NEXT: mov v2.b[3], w10 ; CHECK-NEXT: umov w10, v0.b[4] -; CHECK-NEXT: mov v2.b[3], w8 +; CHECK-NEXT: mov v1.b[3], w8 ; CHECK-NEXT: umov w8, v0.b[13] -; CHECK-NEXT: mov v1.b[4], w9 +; CHECK-NEXT: mov v2.b[4], w9 ; CHECK-NEXT: umov w9, v0.b[5] -; CHECK-NEXT: mov v2.b[4], w10 +; CHECK-NEXT: mov v1.b[4], w10 ; CHECK-NEXT: umov w10, v0.b[14] -; CHECK-NEXT: mov v1.b[5], w8 +; CHECK-NEXT: mov v2.b[5], w8 ; CHECK-NEXT: umov w8, v0.b[6] -; CHECK-NEXT: mov v2.b[5], w9 +; CHECK-NEXT: mov v1.b[5], w9 ; CHECK-NEXT: umov w9, v0.b[7] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 -; CHECK-NEXT: mov v1.b[6], w10 -; CHECK-NEXT: mov v2.b[6], w8 +; CHECK-NEXT: mov v2.b[6], w10 +; CHECK-NEXT: mov v1.b[6], w8 ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: mov x10, #8 // =0x8 ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NEXT: mov v1.b[7], w11 -; CHECK-NEXT: mov v2.b[7], w9 +; CHECK-NEXT: mov v2.b[7], w11 +; CHECK-NEXT: mov v1.b[7], w9 ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: mov x9, #24 // =0x18 -; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: lsl z3.s, z3.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: lsl z2.s, z2.s, #31 +; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: and z0.s, z0.s, #0x1 ; CHECK-NEXT: and z3.s, z3.s, #0x1 -; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: asr z2.s, z2.s, #31 +; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] -; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 -; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0 +; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, #0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] +; CHECK-NEXT: cmpne p1.s, p0/z, 
z1.s, #0 ; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] ; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 +; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 ; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] -; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: st1w { z2.s }, p0, [x0] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index 756e25f8e3368..d6adf9cf0ad67 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -215,44 +215,44 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #128 ; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: ldp q0, q4, [x0] -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: mov z3.h, z0.h[2] -; CHECK-NEXT: fcvtzu x8, h0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: mov z5.h, z4.h[3] -; CHECK-NEXT: fcvtzu x10, h4 -; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: fcvtzu x11, h2 -; CHECK-NEXT: fcvtzu x12, h3 -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzu x13, h0 -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: mov z2.h, z4.h[1] -; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z3.h, z1.h[3] +; CHECK-NEXT: mov z4.h, z1.h[2] ; CHECK-NEXT: fcvtzu x8, h1 -; CHECK-NEXT: fcvtzu x9, h3 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: fcvtzu x10, h0 +; CHECK-NEXT: fcvtzu x9, h2 +; CHECK-NEXT: fcvtzu x11, h3 +; CHECK-NEXT: fcvtzu x12, h4 +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z4.h, z1.h[3] +; CHECK-NEXT: fcvtzu x13, h1 +; CHECK-NEXT: mov z1.h, z1.h[2] +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: fcvtzu x8, h2 +; CHECK-NEXT: fcvtzu x9, h4 ; CHECK-NEXT: stp x12, x11, [sp, #48] -; CHECK-NEXT: fcvtzu x11, h0 -; CHECK-NEXT: mov z1.h, z4.h[2] -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: fcvtzu x12, h2 +; CHECK-NEXT: fcvtzu x11, h1 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: fcvtzu x12, h3 ; CHECK-NEXT: stp x13, x8, [sp] ; CHECK-NEXT: fcvtzu x8, h5 ; CHECK-NEXT: stp x11, x9, [sp, #16] -; CHECK-NEXT: fcvtzu x9, h1 -; CHECK-NEXT: mov z0.h, z4.h[1] -; CHECK-NEXT: mov z1.h, z4.h[3] -; CHECK-NEXT: mov z2.h, z4.h[2] -; CHECK-NEXT: fcvtzu x11, h4 +; CHECK-NEXT: fcvtzu x9, h2 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: fcvtzu x11, h0 +; CHECK-NEXT: mov z0.h, z0.h[2] ; CHECK-NEXT: stp x10, x12, [sp, #96] ; CHECK-NEXT: ldp q3, q4, [sp] -; CHECK-NEXT: fcvtzu x10, h0 -; CHECK-NEXT: fcvtzu x12, h1 +; CHECK-NEXT: fcvtzu x10, h1 +; CHECK-NEXT: fcvtzu x12, h2 ; CHECK-NEXT: stp x9, x8, [sp, #112] -; CHECK-NEXT: fcvtzu x8, h2 +; CHECK-NEXT: fcvtzu x8, h0 ; CHECK-NEXT: ldp q0, q1, [sp, #32] ; CHECK-NEXT: ldp q6, q7, [sp, #96] ; CHECK-NEXT: stp x11, x10, [sp, #64] @@ -965,44 +965,44 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #128 ; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: ldp q0, q4, [x0] -; CHECK-NEXT: mov 
z1.h, z0.h[1] -; CHECK-NEXT: mov z2.h, z0.h[3] -; CHECK-NEXT: mov z3.h, z0.h[2] -; CHECK-NEXT: fcvtzs x8, h0 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: mov z5.h, z4.h[3] -; CHECK-NEXT: fcvtzs x10, h4 -; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: fcvtzs x11, h2 -; CHECK-NEXT: fcvtzs x12, h3 -; CHECK-NEXT: mov z1.h, z0.h[1] -; CHECK-NEXT: mov z3.h, z0.h[3] -; CHECK-NEXT: fcvtzs x13, h0 -; CHECK-NEXT: mov z0.h, z0.h[2] -; CHECK-NEXT: mov z2.h, z4.h[1] -; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z3.h, z1.h[3] +; CHECK-NEXT: mov z4.h, z1.h[2] ; CHECK-NEXT: fcvtzs x8, h1 -; CHECK-NEXT: fcvtzs x9, h3 +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: fcvtzs x10, h0 +; CHECK-NEXT: fcvtzs x9, h2 +; CHECK-NEXT: fcvtzs x11, h3 +; CHECK-NEXT: fcvtzs x12, h4 +; CHECK-NEXT: mov z2.h, z1.h[1] +; CHECK-NEXT: mov z4.h, z1.h[3] +; CHECK-NEXT: fcvtzs x13, h1 +; CHECK-NEXT: mov z1.h, z1.h[2] +; CHECK-NEXT: mov z3.h, z0.h[1] +; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: fcvtzs x8, h2 +; CHECK-NEXT: fcvtzs x9, h4 ; CHECK-NEXT: stp x12, x11, [sp, #48] -; CHECK-NEXT: fcvtzs x11, h0 -; CHECK-NEXT: mov z1.h, z4.h[2] -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: fcvtzs x12, h2 +; CHECK-NEXT: fcvtzs x11, h1 +; CHECK-NEXT: mov z2.h, z0.h[2] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: fcvtzs x12, h3 ; CHECK-NEXT: stp x13, x8, [sp] ; CHECK-NEXT: fcvtzs x8, h5 ; CHECK-NEXT: stp x11, x9, [sp, #16] -; CHECK-NEXT: fcvtzs x9, h1 -; CHECK-NEXT: mov z0.h, z4.h[1] -; CHECK-NEXT: mov z1.h, z4.h[3] -; CHECK-NEXT: mov z2.h, z4.h[2] -; CHECK-NEXT: fcvtzs x11, h4 +; CHECK-NEXT: fcvtzs x9, h2 +; CHECK-NEXT: mov z1.h, z0.h[1] +; CHECK-NEXT: mov z2.h, z0.h[3] +; CHECK-NEXT: fcvtzs x11, h0 +; CHECK-NEXT: mov z0.h, z0.h[2] ; CHECK-NEXT: stp x10, x12, [sp, #96] ; CHECK-NEXT: ldp q3, q4, [sp] -; CHECK-NEXT: fcvtzs x10, h0 -; CHECK-NEXT: fcvtzs x12, h1 +; CHECK-NEXT: fcvtzs x10, h1 +; CHECK-NEXT: fcvtzs x12, h2 ; CHECK-NEXT: stp x9, x8, [sp, #112] -; CHECK-NEXT: fcvtzs x8, h2 +; CHECK-NEXT: fcvtzs x8, h0 ; CHECK-NEXT: ldp q0, q1, [sp, #32] ; CHECK-NEXT: ldp q6, q7, [sp, #96] ; CHECK-NEXT: stp x11, x10, [sp, #64] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index 63a4226655a88..5e3ce0ddebe2e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -112,11 +112,10 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z16.d, z6.d ; CHECK-NEXT: mov z0.d, z3.d ; CHECK-NEXT: ext z1.b, z1.b, z2.b, #8 -; CHECK-NEXT: sunpklo z2.h, z2.b ; CHECK-NEXT: ext z16.b, z16.b, z6.b, #8 +; CHECK-NEXT: sunpklo z6.h, z6.b ; CHECK-NEXT: ext z0.b, z0.b, z3.b, #8 ; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z6.h, z6.b ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z16.h, z16.b ; CHECK-NEXT: sunpklo z4.h, z0.b @@ -130,15 +129,16 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sunpklo z16.s, z16.h ; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s -; CHECK-NEXT: sunpklo z5.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z4.s -; CHECK-NEXT: sunpklo z4.s, z3.h +; CHECK-NEXT: sunpklo z4.h, z2.b +; CHECK-NEXT: sunpklo z2.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: 
sunpklo z3.s, z3.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z5.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s ; CHECK-NEXT: mov z5.d, z7.d ; CHECK-NEXT: ext z5.b, z5.b, z7.b, #8 ; CHECK-NEXT: sunpklo z7.h, z7.b @@ -146,37 +146,37 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z17.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: sunpklo z18.s, z6.h ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: sunpklo z6.s, z6.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z16.s ; CHECK-NEXT: sunpklo z16.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: uzp1 z3.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z4.h, z17.h, z17.h ; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z5.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h -; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b +; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z3.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z2.b, z7.b, z7.b -; CHECK-NEXT: splice z3.b, p0, z3.b, z0.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b -; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b +; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -479,11 +479,10 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: mov z16.d, z6.d ; CHECK-NEXT: mov z0.d, z3.d ; CHECK-NEXT: ext z1.b, z1.b, z2.b, #8 -; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: ext z16.b, z16.b, z6.b, #8 +; CHECK-NEXT: uunpklo z6.h, z6.b ; CHECK-NEXT: ext z0.b, z0.b, z3.b, #8 ; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z6.h, z6.b ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z16.h, z16.b ; CHECK-NEXT: uunpklo z4.h, z0.b @@ -497,15 +496,16 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: uunpklo z16.s, z16.h ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z5.s -; CHECK-NEXT: uunpklo z5.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z4.s -; CHECK-NEXT: uunpklo z4.s, z3.h +; CHECK-NEXT: uunpklo z4.h, z2.b +; CHECK-NEXT: uunpklo z2.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z5.s, z4.h +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s ; CHECK-NEXT: mov 
z5.d, z7.d ; CHECK-NEXT: ext z5.b, z5.b, z7.b, #8 ; CHECK-NEXT: uunpklo z7.h, z7.b @@ -513,37 +513,37 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z17.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 -; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: uunpklo z18.s, z6.h ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 ; CHECK-NEXT: uunpklo z6.s, z6.h -; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z16.s ; CHECK-NEXT: uunpklo z16.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 ; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: uzp1 z3.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z4.h, z17.h, z17.h ; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h -; CHECK-NEXT: splice z3.h, p0, z3.h, z5.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h -; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b +; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z3.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z2.b, z7.b, z7.b -; CHECK-NEXT: splice z3.b, p0, z3.b, z0.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b -; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b +; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 2c3303d5d3407..eb95a410209b4 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -115,19 +115,19 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: sunpklo z6.h, z1.b -; CHECK-NEXT: sunpklo z7.h, z0.b +; CHECK-NEXT: sunpklo z7.h, z1.b +; CHECK-NEXT: sunpklo z16.h, z0.b ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: sunpklo z16.s, z6.h -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: sunpklo z17.s, z7.h +; CHECK-NEXT: sunpklo z6.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: sunpklo z17.s, z16.h +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 ; CHECK-NEXT: sunpklo z4.h, z2.b ; CHECK-NEXT: sunpklo z3.h, z3.b -; CHECK-NEXT: sunpklo z6.s, z6.h ; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: sunpklo z16.s, z16.h +; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z17.s ; CHECK-NEXT: sunpklo z2.s, z4.h ; CHECK-NEXT: sunpklo z5.s, z3.h ; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 @@ -135,34 +135,35 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: ldr q5, [x1] -; CHECK-NEXT: mov z17.d, 
z5.d -; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: ext z17.b, z17.b, z5.b, #8 +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z4.s +; CHECK-NEXT: ldr q3, [x0] +; CHECK-NEXT: ldr q4, [x1] +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z17.d, z4.d +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8 +; CHECK-NEXT: ext z17.b, z17.b, z4.b, #8 +; CHECK-NEXT: sunpklo z18.h, z18.b ; CHECK-NEXT: sunpklo z17.h, z17.b -; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: ldr q4, [x0] +; CHECK-NEXT: sunpklo z20.s, z18.h +; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: sunpklo z19.s, z17.h ; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 -; CHECK-NEXT: mov z18.d, z4.d +; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: sunpklo z18.s, z18.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sunpklo z17.s, z17.h -; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 -; CHECK-NEXT: sunpklo z18.h, z18.b -; CHECK-NEXT: sunpklo z20.s, z18.h -; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 -; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: sunpklo z18.s, z18.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: sunpklo z20.h, z4.b -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: sunpklo z20.h, z3.b +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: sunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 ; CHECK-NEXT: sunpklo z20.s, z20.h ; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: sunpklo z18.h, z5.b -; CHECK-NEXT: uzp1 z7.h, z19.h, z19.h +; CHECK-NEXT: sunpklo z18.h, z4.b +; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h ; CHECK-NEXT: sunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: sunpklo z18.s, z18.h @@ -171,21 +172,21 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z17.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z6.h -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h +; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b +; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b +; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z6.b, z19.b, z19.b -; CHECK-NEXT: splice z7.b, p0, z7.b, z2.b -; CHECK-NEXT: splice z6.b, p0, z6.b, z3.b -; CHECK-NEXT: movprfx z2, z4 -; CHECK-NEXT: mls z2.b, p1/m, z6.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, z7.b, z1.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b +; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -503,19 +504,19 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: uunpklo z6.h, z1.b -; CHECK-NEXT: uunpklo z7.h, z0.b +; CHECK-NEXT: uunpklo z7.h, z1.b +; CHECK-NEXT: uunpklo z16.h, z0.b ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 -; CHECK-NEXT: uunpklo z16.s, z6.h -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; 
CHECK-NEXT: uunpklo z17.s, z7.h +; CHECK-NEXT: uunpklo z6.s, z7.h ; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 +; CHECK-NEXT: uunpklo z17.s, z16.h +; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 ; CHECK-NEXT: uunpklo z4.h, z2.b ; CHECK-NEXT: uunpklo z3.h, z3.b -; CHECK-NEXT: uunpklo z6.s, z6.h ; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s +; CHECK-NEXT: uunpklo z16.s, z16.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z17.s ; CHECK-NEXT: uunpklo z2.s, z4.h ; CHECK-NEXT: uunpklo z5.s, z3.h ; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 @@ -523,34 +524,35 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s -; CHECK-NEXT: ldr q5, [x1] -; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: ext z17.b, z17.b, z5.b, #8 +; CHECK-NEXT: movprfx z5, z3 +; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z4.s +; CHECK-NEXT: ldr q3, [x0] +; CHECK-NEXT: ldr q4, [x1] +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: mov z18.d, z3.d +; CHECK-NEXT: mov z17.d, z4.d +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: ext z18.b, z18.b, z3.b, #8 +; CHECK-NEXT: ext z17.b, z17.b, z4.b, #8 +; CHECK-NEXT: uunpklo z18.h, z18.b ; CHECK-NEXT: uunpklo z17.h, z17.b -; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s -; CHECK-NEXT: ldr q4, [x0] +; CHECK-NEXT: uunpklo z20.s, z18.h +; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: uunpklo z19.s, z17.h ; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 -; CHECK-NEXT: mov z18.d, z4.d +; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s +; CHECK-NEXT: uunpklo z18.s, z18.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uunpklo z17.s, z17.h -; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 -; CHECK-NEXT: uunpklo z18.h, z18.b -; CHECK-NEXT: uunpklo z20.s, z18.h -; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 -; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: uunpklo z18.s, z18.h -; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s -; CHECK-NEXT: uunpklo z20.h, z4.b -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: uunpklo z20.h, z3.b +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: uunpklo z22.s, z20.h ; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 ; CHECK-NEXT: uunpklo z20.s, z20.h ; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s -; CHECK-NEXT: uunpklo z18.h, z5.b -; CHECK-NEXT: uzp1 z7.h, z19.h, z19.h +; CHECK-NEXT: uunpklo z18.h, z4.b +; CHECK-NEXT: uzp1 z16.h, z19.h, z19.h ; CHECK-NEXT: uunpklo z21.s, z18.h ; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 ; CHECK-NEXT: uunpklo z18.s, z18.h @@ -559,21 +561,21 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h -; CHECK-NEXT: splice z7.h, p0, z7.h, z17.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z6.h -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b +; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h +; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b +; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b +; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z6.b, z19.b, z19.b -; CHECK-NEXT: splice z7.b, p0, z7.b, z2.b -; CHECK-NEXT: splice z6.b, p0, z6.b, z3.b -; CHECK-NEXT: movprfx z2, z4 -; CHECK-NEXT: mls z2.b, p1/m, z6.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, 
z7.b, z1.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b +; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b +; CHECK-NEXT: movprfx z2, z3 +; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index 40bc43791c45a..1f036fa08ef15 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -92,12 +92,13 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z16.h, z3.h[5] ; CHECK-NEXT: fmov w9, s17 ; CHECK-NEXT: mov z17.h, z4.h[5] +; CHECK-NEXT: mov z20.h, z7.h[6] ; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z3.h[4] ; CHECK-NEXT: strh w9, [sp, #28] ; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: mov z19.h, z7.h[6] +; CHECK-NEXT: mov z19.h, z6.h[7] ; CHECK-NEXT: zip1 z3.h, z4.h, z3.h ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s16 @@ -133,66 +134,65 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 z1.h, z2.h, z5.h ; CHECK-NEXT: strh w8, [sp, #54] ; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z7.h[7] +; CHECK-NEXT: ldr q16, [sp, #16] ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: strh w8, [sp, #52] ; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z6.h[7] ; CHECK-NEXT: strh w8, [sp, #50] ; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: ldr q18, [sp, #16] +; CHECK-NEXT: mov z18.h, z7.h[7] ; CHECK-NEXT: strh w8, [sp, #48] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z6.h[6] -; CHECK-NEXT: ldr q20, [sp, #48] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z6.h[6] +; CHECK-NEXT: ldr q17, [sp, #48] ; CHECK-NEXT: strh w8, [sp, #46] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z7.h[5] -; CHECK-NEXT: strh w8, [sp, #44] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z6.h[5] +; CHECK-NEXT: mov z19.h, z7.h[5] +; CHECK-NEXT: strh w8, [sp, #44] +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: mov z20.h, z6.h[5] ; CHECK-NEXT: strh w8, [sp, #42] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z7.h[4] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z7.h[4] ; CHECK-NEXT: strh w8, [sp, #40] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z6.h[4] -; CHECK-NEXT: strh w8, [sp, #38] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z5.h[7] +; CHECK-NEXT: mov z19.h, z6.h[4] +; CHECK-NEXT: strh w8, [sp, #38] +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: mov z20.h, z5.h[7] ; CHECK-NEXT: strh w8, [sp, #36] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z2.h[7] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z2.h[7] ; CHECK-NEXT: strh w8, [sp, #34] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z5.h[6] -; CHECK-NEXT: strh w8, [sp, #32] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z2.h[6] +; CHECK-NEXT: mov z19.h, z5.h[6] +; CHECK-NEXT: strh w8, [sp, #32] +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: mov z20.h, z2.h[6] ; CHECK-NEXT: strh w8, [sp, #14] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z5.h[5] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z5.h[5] ; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: mov z17.h, z2.h[5] -; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, 
z5.h[4] -; CHECK-NEXT: fmov w9, s17 +; CHECK-NEXT: mov z19.h, z2.h[5] +; CHECK-NEXT: strh w8, [sp, #10] +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: mov z20.h, z5.h[4] +; CHECK-NEXT: fmov w9, s19 ; CHECK-NEXT: strh w8, [sp, #8] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: mov z16.h, z2.h[4] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z18.h, z2.h[4] ; CHECK-NEXT: strh w9, [sp, #4] ; CHECK-NEXT: ldr q2, [sp, #32] ; CHECK-NEXT: strh w8, [sp, #6] -; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: add z2.h, z18.h, z2.h +; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: add z2.h, z16.h, z2.h ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: strh w8, [sp] ; CHECK-NEXT: ldr q4, [sp] ; CHECK-NEXT: stp q3, q2, [x0, #32] -; CHECK-NEXT: add z1.h, z20.h, z4.h +; CHECK-NEXT: add z1.h, z17.h, z4.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret @@ -956,36 +956,39 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q1, q3, [x0] -; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q0, q1, [x1] ; CHECK-NEXT: mov z4.h, z3.h[6] ; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: mov z6.h, z3.h[2] ; CHECK-NEXT: mov z5.h, z3.h[4] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z7.h, z1.h[6] +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z7.h, z2.h[6] +; CHECK-NEXT: mov z17.h, z2.h[7] +; CHECK-NEXT: mov z16.h, z3.h[1] ; CHECK-NEXT: strh w8, [sp, #40] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[4] +; CHECK-NEXT: mov z4.h, z2.h[4] ; CHECK-NEXT: strh w9, [sp, #32] ; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z1.h[2] +; CHECK-NEXT: mov z5.h, z2.h[2] ; CHECK-NEXT: strh w8, [sp, #46] ; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: mov z6.h, z2.h[2] +; CHECK-NEXT: mov z6.h, z1.h[2] ; CHECK-NEXT: strh w9, [sp, #44] ; CHECK-NEXT: fmov w9, s7 ; CHECK-NEXT: mov z7.h, z0.h[6] ; CHECK-NEXT: strh w8, [sp, #42] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z2.h[6] +; CHECK-NEXT: mov z4.h, z1.h[6] ; CHECK-NEXT: strh w9, [sp, #38] +; CHECK-NEXT: fmov w9, s16 ; CHECK-NEXT: strh w8, [sp, #36] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z2.h[4] +; CHECK-NEXT: mov z5.h, z1.h[4] +; CHECK-NEXT: strh w9, [sp, #56] ; CHECK-NEXT: strh w8, [sp, #34] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: ldr q16, [sp, #32] +; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: strh w8, [sp] @@ -999,63 +1002,60 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: mov z6.h, z3.h[7] ; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: mov z7.h, z1.h[7] +; CHECK-NEXT: mov z7.h, z3.h[5] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z3.h[5] ; CHECK-NEXT: strh w8, [sp, #4] ; CHECK-NEXT: fmov w8, s5 ; CHECK-NEXT: mov z5.h, z3.h[3] -; CHECK-NEXT: mov z3.h, z3.h[1] +; CHECK-NEXT: ldr q3, [sp, #32] ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.h, z2.h[7] -; CHECK-NEXT: ldr q6, [sp] +; CHECK-NEXT: mov z6.h, z2.h[5] +; CHECK-NEXT: ldr q4, [sp] ; CHECK-NEXT: strh w8, [sp, #62] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[5] -; CHECK-NEXT: strh w9, [sp, #56] +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: mov z7.h, z1.h[7] ; CHECK-NEXT: strh w8, [sp, #60] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z1.h[3] -; CHECK-NEXT: mov z1.h, z1.h[1] +; CHECK-NEXT: mov z5.h, z2.h[3] +; CHECK-NEXT: 
mov z2.h, z2.h[1] ; CHECK-NEXT: strh w8, [sp, #58] -; CHECK-NEXT: fmov w8, s7 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z1.h, z2.h[1] +; CHECK-NEXT: fmov w8, s17 +; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: mov z2.h, z0.h[7] ; CHECK-NEXT: strh w8, [sp, #54] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z2.h[5] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z6.h, z1.h[5] ; CHECK-NEXT: strh w9, [sp, #48] ; CHECK-NEXT: strh w8, [sp, #52] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov z5.h, z2.h[3] -; CHECK-NEXT: mov z2.h, z0.h[7] +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z1.h, z1.h[1] ; CHECK-NEXT: strh w8, [sp, #50] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z3.h, z0.h[5] +; CHECK-NEXT: fmov w8, s7 ; CHECK-NEXT: strh w8, [sp, #30] -; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z6.h, z0.h[5] ; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #22] -; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: fmov w8, s6 ; CHECK-NEXT: strh w8, [sp, #20] -; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: fmov w8, s5 ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: add z0.h, z16.h, z0.h +; CHECK-NEXT: add z0.h, z3.h, z0.h ; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: add z1.h, z6.h, z1.h +; CHECK-NEXT: add z1.h, z4.h, z1.h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret @@ -1133,45 +1133,45 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldr q0, [x1] -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: mov z2.h, z0.h[6] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z4.h, z0.h[2] -; CHECK-NEXT: mov z6.h, z1.h[4] -; CHECK-NEXT: mov z3.h, z0.h[4] -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: mov z5.h, z1.h[6] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: mov z2.h, z1.h[6] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z4.h, z1.h[2] +; CHECK-NEXT: mov z6.h, z0.h[4] +; CHECK-NEXT: mov z3.h, z1.h[4] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov z5.h, z0.h[6] ; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z1.h[2] +; CHECK-NEXT: mov z2.h, z0.h[2] ; CHECK-NEXT: strh w9, [sp] ; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z3.h, z0.h[7] +; CHECK-NEXT: mov z3.h, z1.h[7] ; CHECK-NEXT: strh w8, [sp, #14] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z0.h[5] +; CHECK-NEXT: mov z4.h, z1.h[5] ; CHECK-NEXT: strh w9, [sp, #12] ; CHECK-NEXT: fmov w9, s5 -; CHECK-NEXT: mov z5.h, z0.h[3] -; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z1.h, z1.h[1] ; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s6 ; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z0.h, z1.h[1] +; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: strh w8, [sp, #4] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.h, z1.h[7] +; CHECK-NEXT: mov z2.h, z0.h[7] ; CHECK-NEXT: strh w9, [sp, #24] ; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s4 -; CHECK-NEXT: mov z4.h, z1.h[5] +; CHECK-NEXT: mov z4.h, z0.h[5] ; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; 
CHECK-NEXT: mov z5.h, z1.h[3] +; CHECK-NEXT: mov z5.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #22] diff --git a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll index 8b48635b6694c..f0856c43daf1d 100644 --- a/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll +++ b/llvm/test/CodeGen/AArch64/uaddlv-vaddlp-combine.ll @@ -26,7 +26,21 @@ define i16 @uaddlv_uaddlp_v16i8(<16 x i8> %0) { ret i16 %4 } +define i16 @uaddlv_uaddlp_v8i8(<8 x i8> %0) { +; CHECK-LABEL: uaddlv_uaddlp_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddlv h0, v0.8b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %2 = tail call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %0) + %3 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %2) + %4 = trunc i32 %3 to i16 + ret i16 %4 +} + declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>) declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>) +declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>) declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) +declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 7623e35654162..61439021a8875 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -5828,29 +5828,28 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg ; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v1, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 6, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v6, v7, v0 :: v_dual_cndmask_b32 v5, v8, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v0 :: v_dual_cndmask_b32 v6, v8, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6 ; GFX11-NEXT: v_readfirstlane_b32 s0, v18 ; GFX11-NEXT: v_readfirstlane_b32 s1, v17 ; GFX11-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v4 ; GFX11-NEXT: v_dual_cndmask_b32 v7, v9, v0 :: v_dual_cndmask_b32 v8, v10, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v11, v0, s6 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, v1, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v15, v0, s9 -; GFX11-NEXT: v_readfirstlane_b32 s3, v4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v13, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v1, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-NEXT: v_readfirstlane_b32 s5, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v11, v13, v0 :: v_dual_cndmask_b32 v12, v14, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v15, v0, s9 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v16, v1, s9 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 ; GFX11-NEXT: v_readfirstlane_b32 s6, v7 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8 -; GFX11-NEXT: v_readfirstlane_b32 s8, v11 +; GFX11-NEXT: v_readfirstlane_b32 s8, v10 ; GFX11-NEXT: v_readfirstlane_b32 s9, v9 -; GFX11-NEXT: v_readfirstlane_b32 s10, v10 -; GFX11-NEXT: v_readfirstlane_b32 s11, v14 -; GFX11-NEXT: v_readfirstlane_b32 s12, v12 +; GFX11-NEXT: v_readfirstlane_b32 s10, v11 +; GFX11-NEXT: v_readfirstlane_b32 s11, v12 +; GFX11-NEXT: v_readfirstlane_b32 s12, v14 ; GFX11-NEXT: v_readfirstlane_b32 s13, v13 ; GFX11-NEXT: ; return to shader part 
epilog entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir index 1a70e500d36f5..b97d04e809a6f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir @@ -18,16 +18,19 @@ body: | ; GFX6-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[COPY]] ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[COPY]], [[INTRINSIC_TRUNC]] ; GFX6-NEXT: [[FABS:%[0-9]+]]:_(s32) = G_FABS [[FSUB]] - ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 - ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 - ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C2]] - ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND]] - ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C1]] - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C4]] + ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]] + ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX6-NEXT: $vgpr0 = COPY [[FADD]](s32) + ; ; GFX8-LABEL: name: test_intrinsic_round_s32 ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} @@ -35,16 +38,19 @@ body: | ; GFX8-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[COPY]] ; GFX8-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[COPY]], [[INTRINSIC_TRUNC]] ; GFX8-NEXT: [[FABS:%[0-9]+]]:_(s32) = G_FABS [[FSUB]] - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C2]] - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND]] - ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C1]] - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND 
[[SELECT]], [[C4]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX8-NEXT: $vgpr0 = COPY [[FADD]](s32) + ; ; GFX9-LABEL: name: test_intrinsic_round_s32 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -52,15 +58,17 @@ body: | ; GFX9-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[COPY]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[COPY]], [[INTRINSIC_TRUNC]] ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(s32) = G_FABS [[FSUB]] - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C2]] - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND]] - ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C4]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]] + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_INTRINSIC_ROUND %0 @@ -80,16 +88,19 @@ body: | ; GFX6-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = nsz G_INTRINSIC_TRUNC [[COPY]] ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nsz G_FSUB [[COPY]], [[INTRINSIC_TRUNC]] ; GFX6-NEXT: [[FABS:%[0-9]+]]:_(s32) = nsz G_FABS [[FSUB]] - ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 - ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 - ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C2]] - ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND]] - ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nsz G_FCMP floatpred(oge), [[FABS]](s32), [[C1]] - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nsz G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = nsz G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nsz G_FCMP floatpred(oge), [[FABS]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND 
[[SELECT]], [[C4]] + ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]] + ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = nsz G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX6-NEXT: $vgpr0 = COPY [[FADD]](s32) + ; ; GFX8-LABEL: name: test_intrinsic_round_s32_flags ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} @@ -97,16 +108,19 @@ body: | ; GFX8-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = nsz G_INTRINSIC_TRUNC [[COPY]] ; GFX8-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nsz G_FSUB [[COPY]], [[INTRINSIC_TRUNC]] ; GFX8-NEXT: [[FABS:%[0-9]+]]:_(s32) = nsz G_FABS [[FSUB]] - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C2]] - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND]] - ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nsz G_FCMP floatpred(oge), [[FABS]](s32), [[C1]] - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nsz G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = nsz G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nsz G_FCMP floatpred(oge), [[FABS]](s32), [[C]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C4]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = nsz G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX8-NEXT: $vgpr0 = COPY [[FADD]](s32) + ; ; GFX9-LABEL: name: test_intrinsic_round_s32_flags ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -114,15 +128,17 @@ body: | ; GFX9-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = nsz G_INTRINSIC_TRUNC [[COPY]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = nsz G_FSUB [[COPY]], [[INTRINSIC_TRUNC]] ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(s32) = nsz G_FABS [[FSUB]] - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C2]] - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND]] - ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nsz G_FCMP floatpred(oge), [[FABS]](s32), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = nsz G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = nsz G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = nsz G_FCMP floatpred(oge), [[FABS]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX9-NEXT: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C4]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]] + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = nsz G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = nsz G_INTRINSIC_ROUND %0 @@ -162,16 +178,19 @@ body: | ; GFX6-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT1]] ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[COPY]], [[FNEG]] ; GFX6-NEXT: [[FABS:%[0-9]+]]:_(s64) = G_FABS [[FADD]] - ; GFX6-NEXT: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; GFX6-NEXT: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 - ; GFX6-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 - ; GFX6-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 4607182418800017408 - ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C10]] - ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[C11]], [[AND2]] - ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C9]] - ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[OR]], [[C8]] - ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[SELECT1]], [[SELECT2]] + ; GFX6-NEXT: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C8]] + ; GFX6-NEXT: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX6-NEXT: [[C10:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[C9]], [[C10]] + ; GFX6-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 + ; GFX6-NEXT: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807 + ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SELECT2]], [[C12]] + ; GFX6-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C11]] + ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND2]], [[AND3]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[SELECT1]], [[OR]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[FADD1]](s64) + ; ; GFX8-LABEL: name: test_intrinsic_round_s64 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -180,16 +199,19 @@ body: | ; GFX8-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[INTRINSIC_TRUNC]] ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[COPY]], [[FNEG]] ; GFX8-NEXT: [[FABS:%[0-9]+]]:_(s64) = G_FABS [[FADD]] - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 - ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4607182418800017408 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C2]] - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[C3]], [[AND]] - ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C1]] - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 
-9223372036854775808 + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[SELECT]], [[C4]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C3]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND1]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FADD1]](s64) + ; ; GFX9-LABEL: name: test_intrinsic_round_s64 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -198,15 +220,17 @@ body: | ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[INTRINSIC_TRUNC]] ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[COPY]], [[FNEG]] ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(s64) = G_FABS [[FADD]] - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4607182418800017408 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C2]] - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[C3]], [[AND]] - ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[SELECT]], [[C4]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C3]] + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND1]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FADD1]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_INTRINSIC_ROUND %0 @@ -227,25 +251,29 @@ body: | ; GFX6-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[UV]] ; GFX6-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[UV]], [[INTRINSIC_TRUNC]] ; GFX6-NEXT: [[FABS:%[0-9]+]]:_(s32) = G_FABS [[FSUB]] - ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 - ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 - ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C2]] - ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND]] - ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C1]] - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C4]] + ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C3]] + ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX6-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[UV1]] ; GFX6-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[UV1]], [[INTRINSIC_TRUNC1]] ; GFX6-NEXT: [[FABS1:%[0-9]+]]:_(s32) = G_FABS [[FSUB1]] - ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C2]] - ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND1]] - ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s32), [[C1]] - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C]] - ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s32), [[C]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C4]] + ; GFX6-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C3]] + ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[AND3]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; GFX8-LABEL: name: test_intrinsic_round_v2s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -254,25 +282,29 @@ body: | ; GFX8-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[UV]] ; GFX8-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[UV]], [[INTRINSIC_TRUNC]] ; GFX8-NEXT: [[FABS:%[0-9]+]]:_(s32) = G_FABS [[FSUB]] - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C2]] - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND]] - ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C1]] - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C4]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C3]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX8-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[UV1]] ; GFX8-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[UV1]], [[INTRINSIC_TRUNC1]] ; GFX8-NEXT: [[FABS1:%[0-9]+]]:_(s32) = G_FABS [[FSUB1]] - ; GFX8-NEXT: 
[[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C2]] - ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND1]] - ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s32), [[C1]] - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C]] - ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s32), [[C]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C4]] + ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C3]] + ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[AND3]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; ; GFX9-LABEL: name: test_intrinsic_round_v2s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -281,23 +313,26 @@ body: | ; GFX9-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[UV]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[UV]], [[INTRINSIC_TRUNC]] ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(s32) = G_FABS [[FSUB]] - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C2]] - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND]] - ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 5.000000e-01 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s32), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C4]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C3]] + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[AND1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX9-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s32) = G_INTRINSIC_TRUNC [[UV1]] ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s32) = G_FSUB [[UV1]], [[INTRINSIC_TRUNC1]] ; GFX9-NEXT: [[FABS1:%[0-9]+]]:_(s32) = G_FABS [[FSUB1]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C2]] - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C3]], [[AND1]] - ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s32), [[C1]] - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C]] - ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s32), [[C]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C4]] + ; GFX9-NEXT: 
[[AND3:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C3]] + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[AND3]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FADD]](s32), [[FADD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -339,37 +374,41 @@ body: | ; GFX6-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[SELECT1]] ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[UV]], [[FNEG]] ; GFX6-NEXT: [[FABS:%[0-9]+]]:_(s64) = G_FABS [[FADD]] - ; GFX6-NEXT: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; GFX6-NEXT: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 - ; GFX6-NEXT: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 - ; GFX6-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 4607182418800017408 - ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[UV]], [[C10]] - ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[C11]], [[AND2]] - ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C9]] - ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[OR]], [[C8]] - ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[SELECT1]], [[SELECT2]] + ; GFX6-NEXT: [[C8:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C8]] + ; GFX6-NEXT: [[C9:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX6-NEXT: [[C10:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[C9]], [[C10]] + ; GFX6-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 + ; GFX6-NEXT: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807 + ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SELECT2]], [[C12]] + ; GFX6-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[UV]], [[C11]] + ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND2]], [[AND3]] + ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[SELECT1]], [[OR]] ; GFX6-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ubfe), [[UV5]](s32), [[C]](s32), [[C1]](s32) ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[INT1]], [[C2]] - ; GFX6-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C3]] - ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C5]](s32), [[AND3]](s32) + ; GFX6-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C3]] + ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C5]](s32), [[AND4]](s32) ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[C4]], [[SUB1]](s32) ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s64) = G_XOR [[ASHR1]], [[C6]] - ; GFX6-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[XOR1]] + ; GFX6-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[XOR1]] ; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB1]](s32), [[C5]] ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[SUB1]](s32), [[C7]] - ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP2]](s1), [[MV1]], [[AND4]] + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s64) = G_SELECT [[ICMP2]](s1), [[MV1]], [[AND5]] ; GFX6-NEXT: [[SELECT4:%[0-9]+]]:_(s64) = G_SELECT [[ICMP3]](s1), [[UV1]], [[SELECT3]] ; GFX6-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[SELECT4]] ; GFX6-NEXT: [[FADD2:%[0-9]+]]:_(s64) = G_FADD [[UV1]], [[FNEG1]] ; GFX6-NEXT: [[FABS1:%[0-9]+]]:_(s64) = G_FABS [[FADD2]] - ; GFX6-NEXT: [[AND5:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[C10]] - ; 
GFX6-NEXT: [[OR1:%[0-9]+]]:_(s64) = G_OR [[C11]], [[AND5]] - ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s64), [[C9]] - ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C8]] - ; GFX6-NEXT: [[FADD3:%[0-9]+]]:_(s64) = G_FADD [[SELECT4]], [[SELECT5]] + ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s64), [[C8]] + ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[C9]], [[C10]] + ; GFX6-NEXT: [[AND6:%[0-9]+]]:_(s64) = G_AND [[SELECT5]], [[C12]] + ; GFX6-NEXT: [[AND7:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[C11]] + ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s64) = G_OR [[AND6]], [[AND7]] + ; GFX6-NEXT: [[FADD3:%[0-9]+]]:_(s64) = G_FADD [[SELECT4]], [[OR1]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FADD1]](s64), [[FADD3]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; ; GFX8-LABEL: name: test_intrinsic_round_v2s64 ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} @@ -379,26 +418,30 @@ body: | ; GFX8-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[INTRINSIC_TRUNC]] ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[UV]], [[FNEG]] ; GFX8-NEXT: [[FABS:%[0-9]+]]:_(s64) = G_FABS [[FADD]] - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 - ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4607182418800017408 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[UV]], [[C2]] - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[C3]], [[AND]] - ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C1]] - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[SELECT]], [[C4]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[UV]], [[C3]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND1]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX8-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s64) = G_INTRINSIC_TRUNC [[UV1]] ; GFX8-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[INTRINSIC_TRUNC1]] ; GFX8-NEXT: [[FADD2:%[0-9]+]]:_(s64) = G_FADD [[UV1]], [[FNEG1]] ; GFX8-NEXT: [[FABS1:%[0-9]+]]:_(s64) = G_FABS [[FADD2]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[C2]] - ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s64) = G_OR [[C3]], [[AND1]] - ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s64), [[C1]] - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C]] - ; GFX8-NEXT: [[FADD3:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s64), [[C]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND 
[[SELECT1]], [[C4]] + ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[C3]] + ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s64) = G_OR [[AND2]], [[AND3]] + ; GFX8-NEXT: [[FADD3:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FADD1]](s64), [[FADD3]](s64) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; ; GFX9-LABEL: name: test_intrinsic_round_v2s64 ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -408,24 +451,27 @@ body: | ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[INTRINSIC_TRUNC]] ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[UV]], [[FNEG]] ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(s64) = G_FABS [[FADD]] - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4607182418800017408 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[UV]], [[C2]] - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[C3]], [[AND]] - ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s64), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[SELECT]], [[C4]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[UV]], [[C3]] + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[AND1]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX9-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s64) = G_INTRINSIC_TRUNC [[UV1]] ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[INTRINSIC_TRUNC1]] ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s64) = G_FADD [[UV1]], [[FNEG1]] ; GFX9-NEXT: [[FABS1:%[0-9]+]]:_(s64) = G_FABS [[FADD2]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[C2]] - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s64) = G_OR [[C3]], [[AND1]] - ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s64), [[C1]] - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C]] - ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s64), [[C]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[FCMP1]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[SELECT1]], [[C4]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[UV1]], [[C3]] + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s64) = G_OR [[AND2]], [[AND3]] + ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s64) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FADD1]](s64), [[FADD3]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 @@ -453,22 +499,25 @@ body: | ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = 
G_FADD [[FPEXT1]], [[FPEXT2]] ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) ; GFX6-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC1]] - ; GFX6-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]] - ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C3]], [[AND]] + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 ; GFX6-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[FABS]](s16) - ; GFX6-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) + ; GFX6-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT3]](s32), [[FPEXT4]] - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C4]] + ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] ; GFX6-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC]](s16) - ; GFX6-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT]](s16) + ; GFX6-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[OR]](s16) ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FPEXT5]], [[FPEXT6]] ; GFX6-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32) ; GFX6-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC2]](s16) ; GFX6-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX8-LABEL: name: test_intrinsic_round_s16 ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} @@ -477,17 +526,20 @@ body: | ; GFX8-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] ; GFX8-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[INTRINSIC_TRUNC]] ; GFX8-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FSUB]] - ; GFX8-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]] - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C3]], [[AND]] - ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C1]] - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C4]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] + ; GFX8-NEXT: 
[[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX8-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX8-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; ; GFX9-LABEL: name: test_intrinsic_round_s16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -496,15 +548,17 @@ body: | ; GFX9-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[INTRINSIC_TRUNC]] ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FSUB]] - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]] - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C3]], [[AND]] - ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C1]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C]] - ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C4]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -538,18 +592,20 @@ body: | ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FPEXT1]], [[FPEXT2]] ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) ; GFX6-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC1]] - ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 ; GFX6-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[FABS]](s16) - ; GFX6-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[C2]](s16) + ; GFX6-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT3]](s32), [[FPEXT4]] - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C5]] + ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] ; GFX6-NEXT: 
[[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC]](s16) - ; GFX6-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT]](s16) + ; GFX6-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[OR]](s16) ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FPEXT5]], [[FPEXT6]] ; GFX6-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32) ; GFX6-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) @@ -561,14 +617,15 @@ body: | ; GFX6-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FPEXT8]], [[FPEXT9]] ; GFX6-NEXT: [[FPTRUNC4:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32) ; GFX6-NEXT: [[FABS1:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC4]] - ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND1]] ; GFX6-NEXT: [[FPEXT10:%[0-9]+]]:_(s32) = G_FPEXT [[FABS1]](s16) - ; GFX6-NEXT: [[FPEXT11:%[0-9]+]]:_(s32) = G_FPEXT [[C2]](s16) + ; GFX6-NEXT: [[FPEXT11:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT10]](s32), [[FPEXT11]] - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SELECT1]], [[C5]] + ; GFX6-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[AND3]] ; GFX6-NEXT: [[FPEXT12:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC3]](s16) - ; GFX6-NEXT: [[FPEXT13:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT1]](s16) + ; GFX6-NEXT: [[FPEXT13:%[0-9]+]]:_(s32) = G_FPEXT [[OR1]](s16) ; GFX6-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FPEXT12]], [[FPEXT13]] ; GFX6-NEXT: [[FPTRUNC5:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC2]](s16) @@ -577,6 +634,7 @@ body: | ; GFX6-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX6-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) ; GFX6-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) + ; ; GFX8-LABEL: name: test_intrinsic_round_v2s16 ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} @@ -589,29 +647,33 @@ body: | ; GFX8-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] ; GFX8-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[INTRINSIC_TRUNC]] ; GFX8-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FSUB]] - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND]] - ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C2]] - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C1]] - ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C1]] + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C5]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC]], [[C4]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX8-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC1]] ; GFX8-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[INTRINSIC_TRUNC1]] ; GFX8-NEXT: [[FABS1:%[0-9]+]]:_(s16) = G_FABS [[FSUB1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND1]] - ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C2]] - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] - ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C1]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SELECT1]], [[C5]] + ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[AND3]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FADD]](s16) ; GFX8-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FADD1]](s16) ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) + ; ; GFX9-LABEL: name: test_intrinsic_round_v2s16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -624,23 +686,26 @@ body: | ; GFX9-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[INTRINSIC_TRUNC]] ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FSUB]] - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND]] - ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C2]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C1]] - ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C1]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C5]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX9-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC1]] ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[INTRINSIC_TRUNC1]] ; GFX9-NEXT: [[FABS1:%[0-9]+]]:_(s16) = G_FABS [[FSUB1]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], 
[[C3]] - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND1]] - ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C2]] - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] - ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C1]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SELECT1]], [[C5]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[AND3]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -674,18 +739,20 @@ body: | ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FPEXT1]], [[FPEXT2]] ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) ; GFX6-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC1]] - ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 ; GFX6-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[FABS]](s16) - ; GFX6-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[C2]](s16) + ; GFX6-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT3]](s32), [[FPEXT4]] - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C5]] + ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] ; GFX6-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC]](s16) - ; GFX6-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT]](s16) + ; GFX6-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[OR]](s16) ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FPEXT5]], [[FPEXT6]] ; GFX6-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32) ; GFX6-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) @@ -697,14 +764,15 @@ body: | ; GFX6-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FPEXT8]], [[FPEXT9]] ; GFX6-NEXT: [[FPTRUNC4:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32) ; GFX6-NEXT: [[FABS1:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC4]] - ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND1]] ; GFX6-NEXT: [[FPEXT10:%[0-9]+]]:_(s32) = G_FPEXT [[FABS1]](s16) - ; GFX6-NEXT: [[FPEXT11:%[0-9]+]]:_(s32) = G_FPEXT [[C2]](s16) + ; GFX6-NEXT: [[FPEXT11:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT10]](s32), [[FPEXT11]] - ; GFX6-NEXT: 
[[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SELECT1]], [[C5]] + ; GFX6-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[AND3]] ; GFX6-NEXT: [[FPEXT12:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC3]](s16) - ; GFX6-NEXT: [[FPEXT13:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT1]](s16) + ; GFX6-NEXT: [[FPEXT13:%[0-9]+]]:_(s32) = G_FPEXT [[OR1]](s16) ; GFX6-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FPEXT12]], [[FPEXT13]] ; GFX6-NEXT: [[FPTRUNC5:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; GFX6-NEXT: [[FPEXT14:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) @@ -716,14 +784,15 @@ body: | ; GFX6-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FPEXT15]], [[FPEXT16]] ; GFX6-NEXT: [[FPTRUNC7:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD4]](s32) ; GFX6-NEXT: [[FABS2:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC7]] - ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] - ; GFX6-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND2]] ; GFX6-NEXT: [[FPEXT17:%[0-9]+]]:_(s32) = G_FPEXT [[FABS2]](s16) - ; GFX6-NEXT: [[FPEXT18:%[0-9]+]]:_(s32) = G_FPEXT [[C2]](s16) + ; GFX6-NEXT: [[FPEXT18:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) ; GFX6-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT17]](s32), [[FPEXT18]] - ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[OR2]], [[C1]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[SELECT2]], [[C5]] + ; GFX6-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX6-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[AND5]] ; GFX6-NEXT: [[FPEXT19:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC6]](s16) - ; GFX6-NEXT: [[FPEXT20:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT2]](s16) + ; GFX6-NEXT: [[FPEXT20:%[0-9]+]]:_(s32) = G_FPEXT [[OR2]](s16) ; GFX6-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FPEXT19]], [[FPEXT20]] ; GFX6-NEXT: [[FPTRUNC8:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD5]](s32) ; GFX6-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF @@ -737,18 +806,19 @@ body: | ; GFX6-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX6-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) ; GFX6-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC8]](s16) - ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C5]] - ; GFX6-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX6-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C6]] + ; GFX6-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) ; GFX6-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX6-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) - ; GFX6-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C5]] - ; GFX6-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C5]] - ; GFX6-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) - ; GFX6-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX6-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C6]] + ; GFX6-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C6]] + ; GFX6-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; GFX6-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL2]] ; GFX6-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) ; GFX6-NEXT: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[BITCAST6]](<2 x s16>) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) + ; ; GFX8-LABEL: name: test_intrinsic_round_v3s16 ; GFX8: liveins: $vgpr0_vgpr1_vgpr2 ; GFX8-NEXT: {{ $}} @@ -764,31 +834,35 @@ body: | ; GFX8-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] ; GFX8-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[INTRINSIC_TRUNC]] ; GFX8-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FSUB]] - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND]] - ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C2]] - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C1]] - ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C1]] + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C5]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX8-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC1]] ; GFX8-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[INTRINSIC_TRUNC1]] ; GFX8-NEXT: [[FABS1:%[0-9]+]]:_(s16) = G_FABS [[FSUB1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND1]] - ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C2]] - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] - ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C1]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SELECT1]], [[C5]] + ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[AND3]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX8-NEXT: [[INTRINSIC_TRUNC2:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC2]] ; GFX8-NEXT: [[FSUB2:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[INTRINSIC_TRUNC2]] ; GFX8-NEXT: [[FABS2:%[0-9]+]]:_(s16) = G_FABS [[FSUB2]] - ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] - ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND2]] - ; GFX8-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS2]](s16), [[C2]] - ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[OR2]], [[C1]] - ; GFX8-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC2]], 
[[SELECT2]] + ; GFX8-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS2]](s16), [[C1]] + ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[SELECT2]], [[C5]] + ; GFX8-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[AND5]] + ; GFX8-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC2]], [[OR2]] ; GFX8-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX8-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) @@ -800,18 +874,19 @@ body: | ; GFX8-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) ; GFX8-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[FADD2]](s16) - ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C5]] - ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C6]] + ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) ; GFX8-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX8-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) - ; GFX8-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C5]] - ; GFX8-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C5]] - ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) - ; GFX8-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX8-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C6]] + ; GFX8-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C6]] + ; GFX8-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; GFX8-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL2]] ; GFX8-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) ; GFX8-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>), [[BITCAST6]](<2 x s16>) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) + ; ; GFX9-LABEL: name: test_intrinsic_round_v3s16 ; GFX9: liveins: $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -827,31 +902,35 @@ body: | ; GFX9-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[INTRINSIC_TRUNC]] ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FSUB]] - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND]] - ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C2]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C1]] - ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C1]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = 
G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C5]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX9-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC1]] ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[INTRINSIC_TRUNC1]] ; GFX9-NEXT: [[FABS1:%[0-9]+]]:_(s16) = G_FABS [[FSUB1]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND1]] - ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C2]] - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] - ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C1]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SELECT1]], [[C5]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[AND3]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX9-NEXT: [[INTRINSIC_TRUNC2:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC2]] ; GFX9-NEXT: [[FSUB2:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[INTRINSIC_TRUNC2]] ; GFX9-NEXT: [[FABS2:%[0-9]+]]:_(s16) = G_FABS [[FSUB2]] - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND2]] - ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS2]](s16), [[C2]] - ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[OR2]], [[C1]] - ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC2]], [[SELECT2]] + ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS2]](s16), [[C1]] + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[SELECT2]], [[C5]] + ; GFX9-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[AND5]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC2]], [[OR2]] ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) @@ -902,18 +981,20 @@ body: | ; GFX6-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FPEXT1]], [[FPEXT2]] ; GFX6-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) ; GFX6-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC1]] - ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND]] + ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 ; GFX6-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[FABS]](s16) - ; GFX6-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[C2]](s16) + ; 
GFX6-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) ; GFX6-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT3]](s32), [[FPEXT4]] - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C1]] + ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX6-NEXT: [[C3:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX6-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C5]] + ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX6-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] ; GFX6-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC]](s16) - ; GFX6-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT]](s16) + ; GFX6-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[OR]](s16) ; GFX6-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FPEXT5]], [[FPEXT6]] ; GFX6-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32) ; GFX6-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) @@ -925,14 +1006,15 @@ body: | ; GFX6-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FPEXT8]], [[FPEXT9]] ; GFX6-NEXT: [[FPTRUNC4:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32) ; GFX6-NEXT: [[FABS1:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC4]] - ; GFX6-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND1]] ; GFX6-NEXT: [[FPEXT10:%[0-9]+]]:_(s32) = G_FPEXT [[FABS1]](s16) - ; GFX6-NEXT: [[FPEXT11:%[0-9]+]]:_(s32) = G_FPEXT [[C2]](s16) + ; GFX6-NEXT: [[FPEXT11:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) ; GFX6-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT10]](s32), [[FPEXT11]] - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SELECT1]], [[C5]] + ; GFX6-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; GFX6-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[AND3]] ; GFX6-NEXT: [[FPEXT12:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC3]](s16) - ; GFX6-NEXT: [[FPEXT13:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT1]](s16) + ; GFX6-NEXT: [[FPEXT13:%[0-9]+]]:_(s32) = G_FPEXT [[OR1]](s16) ; GFX6-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[FPEXT12]], [[FPEXT13]] ; GFX6-NEXT: [[FPTRUNC5:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) ; GFX6-NEXT: [[FPEXT14:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) @@ -944,14 +1026,15 @@ body: | ; GFX6-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FPEXT15]], [[FPEXT16]] ; GFX6-NEXT: [[FPTRUNC7:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD4]](s32) ; GFX6-NEXT: [[FABS2:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC7]] - ; GFX6-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] - ; GFX6-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND2]] ; GFX6-NEXT: [[FPEXT17:%[0-9]+]]:_(s32) = G_FPEXT [[FABS2]](s16) - ; GFX6-NEXT: [[FPEXT18:%[0-9]+]]:_(s32) = G_FPEXT [[C2]](s16) + ; GFX6-NEXT: [[FPEXT18:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) ; GFX6-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT17]](s32), [[FPEXT18]] - ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[OR2]], [[C1]] + ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[SELECT2]], [[C5]] + ; GFX6-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX6-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR 
[[AND4]], [[AND5]] ; GFX6-NEXT: [[FPEXT19:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC6]](s16) - ; GFX6-NEXT: [[FPEXT20:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT2]](s16) + ; GFX6-NEXT: [[FPEXT20:%[0-9]+]]:_(s32) = G_FPEXT [[OR2]](s16) ; GFX6-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FPEXT19]], [[FPEXT20]] ; GFX6-NEXT: [[FPTRUNC8:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD5]](s32) ; GFX6-NEXT: [[FPEXT21:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) @@ -963,14 +1046,15 @@ body: | ; GFX6-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FPEXT22]], [[FPEXT23]] ; GFX6-NEXT: [[FPTRUNC10:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD6]](s32) ; GFX6-NEXT: [[FABS3:%[0-9]+]]:_(s16) = G_FABS [[FPTRUNC10]] - ; GFX6-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; GFX6-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND3]] ; GFX6-NEXT: [[FPEXT24:%[0-9]+]]:_(s32) = G_FPEXT [[FABS3]](s16) - ; GFX6-NEXT: [[FPEXT25:%[0-9]+]]:_(s32) = G_FPEXT [[C2]](s16) + ; GFX6-NEXT: [[FPEXT25:%[0-9]+]]:_(s32) = G_FPEXT [[C1]](s16) ; GFX6-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FPEXT24]](s32), [[FPEXT25]] - ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[FCMP3]](s1), [[OR3]], [[C1]] + ; GFX6-NEXT: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[FCMP3]](s1), [[C2]], [[C3]] + ; GFX6-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[SELECT3]], [[C5]] + ; GFX6-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; GFX6-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[AND7]] ; GFX6-NEXT: [[FPEXT26:%[0-9]+]]:_(s32) = G_FPEXT [[FPTRUNC9]](s16) - ; GFX6-NEXT: [[FPEXT27:%[0-9]+]]:_(s32) = G_FPEXT [[SELECT3]](s16) + ; GFX6-NEXT: [[FPEXT27:%[0-9]+]]:_(s32) = G_FPEXT [[OR3]](s16) ; GFX6-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[FPEXT26]], [[FPEXT27]] ; GFX6-NEXT: [[FPTRUNC11:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) ; GFX6-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC2]](s16) @@ -985,6 +1069,7 @@ body: | ; GFX6-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) ; GFX6-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; ; GFX8-LABEL: name: test_intrinsic_round_v4s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1002,39 +1087,44 @@ body: | ; GFX8-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] ; GFX8-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[INTRINSIC_TRUNC]] ; GFX8-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FSUB]] - ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND]] - ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C2]] - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C1]] - ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 + ; GFX8-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C1]] + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s16) = 
G_CONSTANT i16 32767 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C5]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] + ; GFX8-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX8-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC1]] ; GFX8-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[INTRINSIC_TRUNC1]] ; GFX8-NEXT: [[FABS1:%[0-9]+]]:_(s16) = G_FABS [[FSUB1]] - ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND1]] - ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C2]] - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] - ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX8-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C1]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SELECT1]], [[C5]] + ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[AND3]] + ; GFX8-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX8-NEXT: [[INTRINSIC_TRUNC2:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC2]] ; GFX8-NEXT: [[FSUB2:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[INTRINSIC_TRUNC2]] ; GFX8-NEXT: [[FABS2:%[0-9]+]]:_(s16) = G_FABS [[FSUB2]] - ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] - ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND2]] - ; GFX8-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS2]](s16), [[C2]] - ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[OR2]], [[C1]] - ; GFX8-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC2]], [[SELECT2]] + ; GFX8-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS2]](s16), [[C1]] + ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[SELECT2]], [[C5]] + ; GFX8-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX8-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[AND5]] + ; GFX8-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC2]], [[OR2]] ; GFX8-NEXT: [[INTRINSIC_TRUNC3:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC3]] ; GFX8-NEXT: [[FSUB3:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC3]], [[INTRINSIC_TRUNC3]] ; GFX8-NEXT: [[FABS3:%[0-9]+]]:_(s16) = G_FABS [[FSUB3]] - ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; GFX8-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND3]] - ; GFX8-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS3]](s16), [[C2]] - ; GFX8-NEXT: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[FCMP3]](s1), [[OR3]], [[C1]] - ; GFX8-NEXT: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC3]], [[SELECT3]] + ; GFX8-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS3]](s16), [[C1]] + ; GFX8-NEXT: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[FCMP3]](s1), [[C2]], [[C3]] + ; GFX8-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[SELECT3]], [[C5]] + ; GFX8-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; GFX8-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[AND7]] + ; GFX8-NEXT: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC3]], [[OR3]] ; GFX8-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FADD]](s16) ; GFX8-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FADD1]](s16) ; 
GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) @@ -1047,6 +1137,7 @@ body: | ; GFX8-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) ; GFX8-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; ; GFX9-LABEL: name: test_intrinsic_round_v4s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -1064,39 +1155,44 @@ body: | ; GFX9-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] ; GFX9-NEXT: [[FSUB:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC]], [[INTRINSIC_TRUNC]] ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(s16) = G_FABS [[FSUB]] - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 15360 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND]] - ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C2]] - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[OR]], [[C1]] - ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[SELECT]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3800 + ; GFX9-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS]](s16), [[C1]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[FCMP]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[SELECT]], [[C5]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[AND1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC]], [[OR]] ; GFX9-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC1]] ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[INTRINSIC_TRUNC1]] ; GFX9-NEXT: [[FABS1:%[0-9]+]]:_(s16) = G_FABS [[FSUB1]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND1]] - ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C2]] - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] - ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] + ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C1]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SELECT1]], [[C5]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[AND3]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[OR1]] ; GFX9-NEXT: [[INTRINSIC_TRUNC2:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC2]] ; GFX9-NEXT: [[FSUB2:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[INTRINSIC_TRUNC2]] ; GFX9-NEXT: [[FABS2:%[0-9]+]]:_(s16) = G_FABS [[FSUB2]] - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND2]] - ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS2]](s16), [[C2]] - ; GFX9-NEXT: 
[[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[OR2]], [[C1]] - ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC2]], [[SELECT2]] + ; GFX9-NEXT: [[FCMP2:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS2]](s16), [[C1]] + ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[FCMP2]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[SELECT2]], [[C5]] + ; GFX9-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[AND5]] + ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC2]], [[OR2]] ; GFX9-NEXT: [[INTRINSIC_TRUNC3:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC3]] ; GFX9-NEXT: [[FSUB3:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC3]], [[INTRINSIC_TRUNC3]] ; GFX9-NEXT: [[FABS3:%[0-9]+]]:_(s16) = G_FABS [[FSUB3]] - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[C4]], [[AND3]] - ; GFX9-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS3]](s16), [[C2]] - ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[FCMP3]](s1), [[OR3]], [[C1]] - ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC3]], [[SELECT3]] + ; GFX9-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS3]](s16), [[C1]] + ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[FCMP3]](s1), [[C2]], [[C3]] + ; GFX9-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[SELECT3]], [[C5]] + ; GFX9-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[AND7]] + ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC3]], [[OR3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD2]](s16), [[FADD3]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 0255a77aa0ffd..eb3f74be71de0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -838,165 +838,165 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: v_readfirstlane_b32 s21, v2 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_readfirstlane_b32 s23, v1 -; GFX7-NEXT: v_readfirstlane_b32 s19, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX7-NEXT: v_mul_hi_u32 v3, v2, s8 -; GFX7-NEXT: v_mov_b32_e32 v4, s11 ; GFX7-NEXT: s_mul_i32 s18, s16, s10 -; GFX7-NEXT: v_readfirstlane_b32 s24, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s12 -; GFX7-NEXT: v_readfirstlane_b32 s22, v3 -; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX7-NEXT: s_mul_i32 s20, s1, s9 -; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4 +; GFX7-NEXT: v_readfirstlane_b32 s19, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_add_u32 s18, s20, s18 -; GFX7-NEXT: v_readfirstlane_b32 s25, v3 -; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10 ; GFX7-NEXT: s_addc_u32 s19, s21, s19 ; GFX7-NEXT: s_mul_i32 s21, s2, s8 +; GFX7-NEXT: v_readfirstlane_b32 s23, v1 +; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX7-NEXT: s_cselect_b32 s20, 1, 0 +; GFX7-NEXT: v_readfirstlane_b32 s22, v3 ; GFX7-NEXT: s_add_u32 s18, s21, s18 -; GFX7-NEXT: v_readfirstlane_b32 s28, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_addc_u32 s19, s22, s19 ; GFX7-NEXT: s_mul_i32 s22, s16, s9 -; GFX7-NEXT: 
v_readfirstlane_b32 s27, v5 -; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9 ; GFX7-NEXT: s_cselect_b32 s21, 1, 0 ; GFX7-NEXT: s_add_u32 s17, s22, s17 -; GFX7-NEXT: s_addc_u32 s18, s23, s18 -; GFX7-NEXT: s_mul_i32 s23, s1, s8 -; GFX7-NEXT: s_cselect_b32 s22, 1, 0 -; GFX7-NEXT: s_add_u32 s17, s23, s17 -; GFX7-NEXT: s_addc_u32 s18, s24, s18 -; GFX7-NEXT: s_mul_i32 s24, s16, s12 -; GFX7-NEXT: s_mul_i32 s26, s1, s11 +; GFX7-NEXT: s_addc_u32 s22, s23, s18 +; GFX7-NEXT: v_readfirstlane_b32 s23, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX7-NEXT: s_mul_i32 s18, s1, s8 +; GFX7-NEXT: s_cselect_b32 s25, 1, 0 +; GFX7-NEXT: s_add_u32 s18, s18, s17 +; GFX7-NEXT: s_addc_u32 s17, s23, s22 +; GFX7-NEXT: v_mov_b32_e32 v4, s11 +; GFX7-NEXT: v_readfirstlane_b32 s23, v3 +; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10 +; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4 +; GFX7-NEXT: s_mul_i32 s22, s16, s12 +; GFX7-NEXT: s_mul_i32 s24, s1, s11 +; GFX7-NEXT: v_readfirstlane_b32 s28, v3 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_readfirstlane_b32 s27, v5 +; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9 +; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_add_u32 s24, s24, s22 +; GFX7-NEXT: s_addc_u32 s23, s27, s23 ; GFX7-NEXT: v_readfirstlane_b32 s29, v5 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: s_cselect_b32 s23, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s26, s24 ; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8 -; GFX7-NEXT: s_addc_u32 s25, s27, s25 ; GFX7-NEXT: s_mul_i32 s27, s2, s10 -; GFX7-NEXT: s_cselect_b32 s26, 1, 0 +; GFX7-NEXT: s_cselect_b32 s22, 1, 0 ; GFX7-NEXT: s_add_u32 s24, s27, s24 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX7-NEXT: s_addc_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s27, s28, s23 ; GFX7-NEXT: s_mul_i32 s28, s3, s9 -; GFX7-NEXT: s_cselect_b32 s27, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s28, s24 +; GFX7-NEXT: s_cselect_b32 s23, 1, 0 +; GFX7-NEXT: s_add_u32 s28, s28, s24 ; GFX7-NEXT: v_readfirstlane_b32 s30, v6 ; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX7-NEXT: s_addc_u32 s25, s29, s25 +; GFX7-NEXT: s_addc_u32 s27, s29, s27 ; GFX7-NEXT: s_mul_i32 s29, s4, s8 -; GFX7-NEXT: s_cselect_b32 s28, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s29, s24 +; GFX7-NEXT: s_cselect_b32 s24, 1, 0 +; GFX7-NEXT: s_add_u32 s28, s29, s28 ; GFX7-NEXT: v_readfirstlane_b32 s33, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX7-NEXT: s_addc_u32 s25, s30, s25 +; GFX7-NEXT: s_addc_u32 s27, s30, s27 ; GFX7-NEXT: s_mul_i32 s30, s16, s11 ; GFX7-NEXT: s_cselect_b32 s29, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s31, v6 ; GFX7-NEXT: s_add_u32 s19, s30, s19 -; GFX7-NEXT: s_addc_u32 s24, s31, s24 +; GFX7-NEXT: s_addc_u32 s28, s31, s28 ; GFX7-NEXT: s_mul_i32 s31, s1, s10 ; GFX7-NEXT: s_cselect_b32 s30, 1, 0 ; GFX7-NEXT: s_add_u32 s19, s31, s19 ; GFX7-NEXT: v_readfirstlane_b32 s34, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s8 -; GFX7-NEXT: s_addc_u32 s24, s33, s24 +; GFX7-NEXT: s_addc_u32 s28, s33, s28 ; GFX7-NEXT: s_mul_i32 s33, s2, s9 ; GFX7-NEXT: s_cselect_b32 s31, 1, 0 ; GFX7-NEXT: s_add_u32 s19, s33, s19 -; GFX7-NEXT: s_addc_u32 s24, s34, s24 +; GFX7-NEXT: s_addc_u32 s28, s34, s28 ; GFX7-NEXT: s_mul_i32 s34, s3, s8 ; GFX7-NEXT: s_cselect_b32 s33, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: s_add_u32 s19, s34, s19 ; GFX7-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-NEXT: s_addc_u32 s24, s35, s24 +; GFX7-NEXT: s_addc_u32 s28, s35, s28 ; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX7-NEXT: s_cselect_b32 s34, 1, 0 -; GFX7-NEXT: s_cmp_lg_u32 s23, 0 -; GFX7-NEXT: s_addc_u32 s19, s22, s19 +; GFX7-NEXT: s_cmp_lg_u32 s26, 0 +; GFX7-NEXT: s_addc_u32 s19, 
s25, s19 ; GFX7-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-NEXT: s_cselect_b32 s22, 1, 0 +; GFX7-NEXT: s_cselect_b32 s25, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 ; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX7-NEXT: s_addc_u32 s20, s20, 0 -; GFX7-NEXT: v_readfirstlane_b32 s23, v0 +; GFX7-NEXT: v_readfirstlane_b32 s26, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX7-NEXT: s_cmp_lg_u32 s22, 0 -; GFX7-NEXT: s_addc_u32 s20, s20, s24 -; GFX7-NEXT: s_mul_i32 s22, s16, s14 -; GFX7-NEXT: s_mul_i32 s24, s1, s13 +; GFX7-NEXT: s_cmp_lg_u32 s25, 0 +; GFX7-NEXT: s_addc_u32 s20, s20, s28 +; GFX7-NEXT: s_mul_i32 s25, s16, s14 +; GFX7-NEXT: s_mul_i32 s28, s1, s13 ; GFX7-NEXT: s_cselect_b32 s21, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s22, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11 -; GFX7-NEXT: s_mul_i32 s24, s2, s12 -; GFX7-NEXT: s_add_u32 s22, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_mul_i32 s28, s2, s12 +; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10 -; GFX7-NEXT: s_mul_i32 s24, s3, s11 -; GFX7-NEXT: s_add_u32 s22, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_mul_i32 s28, s3, s11 +; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: v_readfirstlane_b32 s35, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9 -; GFX7-NEXT: s_mul_i32 s24, s4, s10 -; GFX7-NEXT: s_add_u32 s22, s24, s22 +; GFX7-NEXT: s_mul_i32 s28, s4, s10 +; GFX7-NEXT: s_add_u32 s25, s28, s25 ; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 ; GFX7-NEXT: v_mov_b32_e32 v6, s6 ; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8 -; GFX7-NEXT: s_mul_i32 s24, s5, s9 -; GFX7-NEXT: s_add_u32 s22, s24, s22 +; GFX7-NEXT: s_mul_i32 s28, s5, s9 +; GFX7-NEXT: s_add_u32 s25, s28, s25 ; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX7-NEXT: v_readfirstlane_b32 s36, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 -; GFX7-NEXT: s_mul_i32 s24, s6, s8 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_mul_i32 s28, s6, s8 ; GFX7-NEXT: v_readfirstlane_b32 s35, v6 -; GFX7-NEXT: s_add_u32 s22, s24, s22 -; GFX7-NEXT: s_addc_u32 s23, s35, s23 -; GFX7-NEXT: s_mul_i32 s24, s16, s13 +; GFX7-NEXT: s_add_u32 s25, s28, s25 +; GFX7-NEXT: s_addc_u32 s26, s35, s26 +; GFX7-NEXT: s_mul_i32 s28, s16, s13 ; GFX7-NEXT: v_readfirstlane_b32 s35, v2 -; GFX7-NEXT: s_add_u32 s24, s24, s25 +; GFX7-NEXT: s_add_u32 s27, s28, s27 ; GFX7-NEXT: v_readfirstlane_b32 s37, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX7-NEXT: s_addc_u32 s22, s35, s22 +; GFX7-NEXT: s_addc_u32 s25, s35, s25 ; GFX7-NEXT: s_mul_i32 s35, s1, s12 -; GFX7-NEXT: s_cselect_b32 s25, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s35, s24 -; GFX7-NEXT: s_addc_u32 s22, s36, s22 +; GFX7-NEXT: s_cselect_b32 s28, 1, 0 +; GFX7-NEXT: s_add_u32 s27, s35, s27 +; GFX7-NEXT: s_addc_u32 s25, s36, s25 ; GFX7-NEXT: s_mul_i32 s36, s2, s11 ; GFX7-NEXT: s_cselect_b32 s35, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s36, s24 +; GFX7-NEXT: s_add_u32 s27, s36, s27 ; GFX7-NEXT: v_readfirstlane_b32 s38, v1 ; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX7-NEXT: s_addc_u32 s22, s37, s22 +; GFX7-NEXT: s_addc_u32 s25, s37, s25 ; GFX7-NEXT: s_mul_i32 s37, s3, s10 ; 
GFX7-NEXT: s_cselect_b32 s36, 1, 0 -; GFX7-NEXT: s_add_u32 s24, s37, s24 +; GFX7-NEXT: s_add_u32 s27, s37, s27 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX7-NEXT: s_addc_u32 s22, s38, s22 +; GFX7-NEXT: s_addc_u32 s25, s38, s25 ; GFX7-NEXT: s_mul_i32 s38, s4, s9 ; GFX7-NEXT: s_cselect_b32 s37, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s39, v1 -; GFX7-NEXT: s_add_u32 s24, s38, s24 -; GFX7-NEXT: s_addc_u32 s22, s39, s22 +; GFX7-NEXT: s_add_u32 s27, s38, s27 +; GFX7-NEXT: s_addc_u32 s25, s39, s25 ; GFX7-NEXT: s_mul_i32 s39, s5, s8 ; GFX7-NEXT: s_cselect_b32 s38, 1, 0 ; GFX7-NEXT: v_readfirstlane_b32 s40, v0 -; GFX7-NEXT: s_add_u32 s24, s39, s24 -; GFX7-NEXT: s_addc_u32 s22, s40, s22 +; GFX7-NEXT: s_add_u32 s27, s39, s27 +; GFX7-NEXT: s_addc_u32 s25, s40, s25 ; GFX7-NEXT: s_cselect_b32 s39, 1, 0 ; GFX7-NEXT: s_cmp_lg_u32 s31, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 @@ -1005,18 +1005,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cmp_lg_u32 s34, 0 ; GFX7-NEXT: s_addc_u32 s30, s30, 0 ; GFX7-NEXT: s_cmp_lg_u32 s21, 0 -; GFX7-NEXT: s_addc_u32 s21, s30, s24 -; GFX7-NEXT: s_cselect_b32 s24, 1, 0 -; GFX7-NEXT: s_cmp_lg_u32 s27, 0 -; GFX7-NEXT: s_addc_u32 s26, s26, 0 -; GFX7-NEXT: s_cmp_lg_u32 s28, 0 -; GFX7-NEXT: s_addc_u32 s26, s26, 0 -; GFX7-NEXT: s_cmp_lg_u32 s29, 0 -; GFX7-NEXT: s_addc_u32 s26, s26, 0 +; GFX7-NEXT: s_addc_u32 s21, s30, s27 +; GFX7-NEXT: s_cselect_b32 s27, 1, 0 +; GFX7-NEXT: s_cmp_lg_u32 s23, 0 +; GFX7-NEXT: s_addc_u32 s22, s22, 0 ; GFX7-NEXT: s_cmp_lg_u32 s24, 0 -; GFX7-NEXT: s_addc_u32 s22, s26, s22 +; GFX7-NEXT: s_addc_u32 s22, s22, 0 +; GFX7-NEXT: s_cmp_lg_u32 s29, 0 +; GFX7-NEXT: s_addc_u32 s22, s22, 0 +; GFX7-NEXT: s_cmp_lg_u32 s27, 0 +; GFX7-NEXT: s_addc_u32 s22, s22, s25 ; GFX7-NEXT: s_mul_i32 s16, s16, s15 -; GFX7-NEXT: s_addc_u32 s15, s23, s16 +; GFX7-NEXT: s_addc_u32 s15, s26, s16 ; GFX7-NEXT: s_mul_i32 s1, s1, s14 ; GFX7-NEXT: s_cmp_lg_u32 s39, 0 ; GFX7-NEXT: s_addc_u32 s1, s15, s1 @@ -1033,13 +1033,13 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX7-NEXT: s_cmp_lg_u32 s35, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s5 ; GFX7-NEXT: s_mul_i32 s6, s6, s9 -; GFX7-NEXT: s_cmp_lg_u32 s25, 0 +; GFX7-NEXT: s_cmp_lg_u32 s28, 0 ; GFX7-NEXT: s_addc_u32 s1, s1, s6 ; GFX7-NEXT: s_mul_i32 s7, s7, s8 ; GFX7-NEXT: s_mul_i32 s0, s0, s8 ; GFX7-NEXT: s_add_u32 s7, s7, s1 -; GFX7-NEXT: s_mov_b32 s1, s17 -; GFX7-NEXT: s_mov_b32 s2, s18 +; GFX7-NEXT: s_mov_b32 s1, s18 +; GFX7-NEXT: s_mov_b32 s2, s17 ; GFX7-NEXT: s_mov_b32 s3, s19 ; GFX7-NEXT: s_mov_b32 s4, s20 ; GFX7-NEXT: s_mov_b32 s5, s21 @@ -1059,165 +1059,165 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: v_readfirstlane_b32 s21, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_readfirstlane_b32 s23, v1 -; GFX8-NEXT: v_readfirstlane_b32 s19, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, s8 -; GFX8-NEXT: v_mov_b32_e32 v4, s11 ; GFX8-NEXT: s_mul_i32 s18, s16, s10 -; GFX8-NEXT: v_readfirstlane_b32 s24, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s12 -; GFX8-NEXT: v_readfirstlane_b32 s22, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX8-NEXT: s_mul_i32 s20, s1, s9 -; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4 +; GFX8-NEXT: v_readfirstlane_b32 s19, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_add_u32 s18, s20, s18 -; GFX8-NEXT: v_readfirstlane_b32 s25, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10 ; GFX8-NEXT: s_addc_u32 s19, s21, s19 ; 
GFX8-NEXT: s_mul_i32 s21, s2, s8 +; GFX8-NEXT: v_readfirstlane_b32 s23, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 +; GFX8-NEXT: v_readfirstlane_b32 s22, v3 ; GFX8-NEXT: s_add_u32 s18, s21, s18 -; GFX8-NEXT: v_readfirstlane_b32 s28, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s19, s22, s19 ; GFX8-NEXT: s_mul_i32 s22, s16, s9 -; GFX8-NEXT: v_readfirstlane_b32 s27, v5 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: s_add_u32 s17, s22, s17 -; GFX8-NEXT: s_addc_u32 s18, s23, s18 -; GFX8-NEXT: s_mul_i32 s23, s1, s8 -; GFX8-NEXT: s_cselect_b32 s22, 1, 0 -; GFX8-NEXT: s_add_u32 s17, s23, s17 -; GFX8-NEXT: s_addc_u32 s18, s24, s18 -; GFX8-NEXT: s_mul_i32 s24, s16, s12 -; GFX8-NEXT: s_mul_i32 s26, s1, s11 +; GFX8-NEXT: s_addc_u32 s22, s23, s18 +; GFX8-NEXT: v_readfirstlane_b32 s23, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 +; GFX8-NEXT: s_mul_i32 s18, s1, s8 +; GFX8-NEXT: s_cselect_b32 s25, 1, 0 +; GFX8-NEXT: s_add_u32 s18, s18, s17 +; GFX8-NEXT: s_addc_u32 s17, s23, s22 +; GFX8-NEXT: v_mov_b32_e32 v4, s11 +; GFX8-NEXT: v_readfirstlane_b32 s23, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10 +; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4 +; GFX8-NEXT: s_mul_i32 s22, s16, s12 +; GFX8-NEXT: s_mul_i32 s24, s1, s11 +; GFX8-NEXT: v_readfirstlane_b32 s28, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_readfirstlane_b32 s27, v5 +; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9 +; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_add_u32 s24, s24, s22 +; GFX8-NEXT: s_addc_u32 s23, s27, s23 ; GFX8-NEXT: v_readfirstlane_b32 s29, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s26, s24 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8 -; GFX8-NEXT: s_addc_u32 s25, s27, s25 ; GFX8-NEXT: s_mul_i32 s27, s2, s10 -; GFX8-NEXT: s_cselect_b32 s26, 1, 0 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 ; GFX8-NEXT: s_add_u32 s24, s27, s24 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10 -; GFX8-NEXT: s_addc_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s27, s28, s23 ; GFX8-NEXT: s_mul_i32 s28, s3, s9 -; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s28, s24 +; GFX8-NEXT: s_cselect_b32 s23, 1, 0 +; GFX8-NEXT: s_add_u32 s28, s28, s24 ; GFX8-NEXT: v_readfirstlane_b32 s30, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4 -; GFX8-NEXT: s_addc_u32 s25, s29, s25 +; GFX8-NEXT: s_addc_u32 s27, s29, s27 ; GFX8-NEXT: s_mul_i32 s29, s4, s8 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s29, s24 +; GFX8-NEXT: s_cselect_b32 s24, 1, 0 +; GFX8-NEXT: s_add_u32 s28, s29, s28 ; GFX8-NEXT: v_readfirstlane_b32 s33, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9 -; GFX8-NEXT: s_addc_u32 s25, s30, s25 +; GFX8-NEXT: s_addc_u32 s27, s30, s27 ; GFX8-NEXT: s_mul_i32 s30, s16, s11 ; GFX8-NEXT: s_cselect_b32 s29, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s31, v6 ; GFX8-NEXT: s_add_u32 s19, s30, s19 -; GFX8-NEXT: s_addc_u32 s24, s31, s24 +; GFX8-NEXT: s_addc_u32 s28, s31, s28 ; GFX8-NEXT: s_mul_i32 s31, s1, s10 ; GFX8-NEXT: s_cselect_b32 s30, 1, 0 ; GFX8-NEXT: s_add_u32 s19, s31, s19 ; GFX8-NEXT: v_readfirstlane_b32 s34, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s8 -; GFX8-NEXT: s_addc_u32 s24, s33, s24 +; GFX8-NEXT: s_addc_u32 s28, s33, s28 ; GFX8-NEXT: s_mul_i32 s33, s2, s9 ; GFX8-NEXT: s_cselect_b32 s31, 1, 0 ; GFX8-NEXT: s_add_u32 s19, s33, s19 -; GFX8-NEXT: s_addc_u32 s24, s34, s24 +; GFX8-NEXT: s_addc_u32 s28, s34, s28 ; GFX8-NEXT: s_mul_i32 s34, s3, s8 ; GFX8-NEXT: s_cselect_b32 s33, 1, 0 ; GFX8-NEXT: 
v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: s_add_u32 s19, s34, s19 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: s_addc_u32 s24, s35, s24 +; GFX8-NEXT: s_addc_u32 s28, s35, s28 ; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 ; GFX8-NEXT: s_cselect_b32 s34, 1, 0 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_addc_u32 s19, s22, s19 +; GFX8-NEXT: s_cmp_lg_u32 s26, 0 +; GFX8-NEXT: s_addc_u32 s19, s25, s19 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 -; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_cselect_b32 s25, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 ; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 ; GFX8-NEXT: s_addc_u32 s20, s20, 0 -; GFX8-NEXT: v_readfirstlane_b32 s23, v0 +; GFX8-NEXT: v_readfirstlane_b32 s26, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1 -; GFX8-NEXT: s_cmp_lg_u32 s22, 0 -; GFX8-NEXT: s_addc_u32 s20, s20, s24 -; GFX8-NEXT: s_mul_i32 s22, s16, s14 -; GFX8-NEXT: s_mul_i32 s24, s1, s13 +; GFX8-NEXT: s_cmp_lg_u32 s25, 0 +; GFX8-NEXT: s_addc_u32 s20, s20, s28 +; GFX8-NEXT: s_mul_i32 s25, s16, s14 +; GFX8-NEXT: s_mul_i32 s28, s1, s13 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s22, s24, s22 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11 -; GFX8-NEXT: s_mul_i32 s24, s2, s12 -; GFX8-NEXT: s_add_u32 s22, s24, s22 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_mul_i32 s28, s2, s12 +; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10 -; GFX8-NEXT: s_mul_i32 s24, s3, s11 -; GFX8-NEXT: s_add_u32 s22, s24, s22 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_mul_i32 s28, s3, s11 +; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: v_readfirstlane_b32 s35, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9 -; GFX8-NEXT: s_mul_i32 s24, s4, s10 -; GFX8-NEXT: s_add_u32 s22, s24, s22 +; GFX8-NEXT: s_mul_i32 s28, s4, s10 +; GFX8-NEXT: s_add_u32 s25, s28, s25 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8 -; GFX8-NEXT: s_mul_i32 s24, s5, s9 -; GFX8-NEXT: s_add_u32 s22, s24, s22 +; GFX8-NEXT: s_mul_i32 s28, s5, s9 +; GFX8-NEXT: s_add_u32 s25, s28, s25 ; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2 ; GFX8-NEXT: v_readfirstlane_b32 s36, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 -; GFX8-NEXT: s_mul_i32 s24, s6, s8 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_mul_i32 s28, s6, s8 ; GFX8-NEXT: v_readfirstlane_b32 s35, v6 -; GFX8-NEXT: s_add_u32 s22, s24, s22 -; GFX8-NEXT: s_addc_u32 s23, s35, s23 -; GFX8-NEXT: s_mul_i32 s24, s16, s13 +; GFX8-NEXT: s_add_u32 s25, s28, s25 +; GFX8-NEXT: s_addc_u32 s26, s35, s26 +; GFX8-NEXT: s_mul_i32 s28, s16, s13 ; GFX8-NEXT: v_readfirstlane_b32 s35, v2 -; GFX8-NEXT: s_add_u32 s24, s24, s25 +; GFX8-NEXT: s_add_u32 s27, s28, s27 ; GFX8-NEXT: v_readfirstlane_b32 s37, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10 -; GFX8-NEXT: s_addc_u32 s22, s35, s22 +; GFX8-NEXT: s_addc_u32 s25, s35, s25 ; GFX8-NEXT: s_mul_i32 s35, s1, s12 -; GFX8-NEXT: s_cselect_b32 s25, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s35, s24 -; GFX8-NEXT: s_addc_u32 s22, s36, s22 +; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_add_u32 s27, s35, s27 
+; GFX8-NEXT: s_addc_u32 s25, s36, s25 ; GFX8-NEXT: s_mul_i32 s36, s2, s11 ; GFX8-NEXT: s_cselect_b32 s35, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s36, s24 +; GFX8-NEXT: s_add_u32 s27, s36, s27 ; GFX8-NEXT: v_readfirstlane_b32 s38, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9 -; GFX8-NEXT: s_addc_u32 s22, s37, s22 +; GFX8-NEXT: s_addc_u32 s25, s37, s25 ; GFX8-NEXT: s_mul_i32 s37, s3, s10 ; GFX8-NEXT: s_cselect_b32 s36, 1, 0 -; GFX8-NEXT: s_add_u32 s24, s37, s24 +; GFX8-NEXT: s_add_u32 s27, s37, s27 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8 -; GFX8-NEXT: s_addc_u32 s22, s38, s22 +; GFX8-NEXT: s_addc_u32 s25, s38, s25 ; GFX8-NEXT: s_mul_i32 s38, s4, s9 ; GFX8-NEXT: s_cselect_b32 s37, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s39, v1 -; GFX8-NEXT: s_add_u32 s24, s38, s24 -; GFX8-NEXT: s_addc_u32 s22, s39, s22 +; GFX8-NEXT: s_add_u32 s27, s38, s27 +; GFX8-NEXT: s_addc_u32 s25, s39, s25 ; GFX8-NEXT: s_mul_i32 s39, s5, s8 ; GFX8-NEXT: s_cselect_b32 s38, 1, 0 ; GFX8-NEXT: v_readfirstlane_b32 s40, v0 -; GFX8-NEXT: s_add_u32 s24, s39, s24 -; GFX8-NEXT: s_addc_u32 s22, s40, s22 +; GFX8-NEXT: s_add_u32 s27, s39, s27 +; GFX8-NEXT: s_addc_u32 s25, s40, s25 ; GFX8-NEXT: s_cselect_b32 s39, 1, 0 ; GFX8-NEXT: s_cmp_lg_u32 s31, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 @@ -1226,18 +1226,18 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cmp_lg_u32 s34, 0 ; GFX8-NEXT: s_addc_u32 s30, s30, 0 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_addc_u32 s21, s30, s24 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 -; GFX8-NEXT: s_addc_u32 s26, s26, 0 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_addc_u32 s26, s26, 0 -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 -; GFX8-NEXT: s_addc_u32 s26, s26, 0 +; GFX8-NEXT: s_addc_u32 s21, s30, s27 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_cmp_lg_u32 s23, 0 +; GFX8-NEXT: s_addc_u32 s22, s22, 0 ; GFX8-NEXT: s_cmp_lg_u32 s24, 0 -; GFX8-NEXT: s_addc_u32 s22, s26, s22 +; GFX8-NEXT: s_addc_u32 s22, s22, 0 +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_addc_u32 s22, s22, 0 +; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_addc_u32 s22, s22, s25 ; GFX8-NEXT: s_mul_i32 s16, s16, s15 -; GFX8-NEXT: s_addc_u32 s15, s23, s16 +; GFX8-NEXT: s_addc_u32 s15, s26, s16 ; GFX8-NEXT: s_mul_i32 s1, s1, s14 ; GFX8-NEXT: s_cmp_lg_u32 s39, 0 ; GFX8-NEXT: s_addc_u32 s1, s15, s1 @@ -1254,13 +1254,13 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX8-NEXT: s_cmp_lg_u32 s35, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: s_mul_i32 s6, s6, s9 -; GFX8-NEXT: s_cmp_lg_u32 s25, 0 +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 ; GFX8-NEXT: s_addc_u32 s1, s1, s6 ; GFX8-NEXT: s_mul_i32 s7, s7, s8 ; GFX8-NEXT: s_mul_i32 s0, s0, s8 ; GFX8-NEXT: s_add_u32 s7, s7, s1 -; GFX8-NEXT: s_mov_b32 s1, s17 -; GFX8-NEXT: s_mov_b32 s2, s18 +; GFX8-NEXT: s_mov_b32 s1, s18 +; GFX8-NEXT: s_mov_b32 s2, s17 ; GFX8-NEXT: s_mov_b32 s3, s19 ; GFX8-NEXT: s_mov_b32 s4, s20 ; GFX8-NEXT: s_mov_b32 s5, s21 @@ -1269,9 +1269,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; ; GFX9-LABEL: s_mul_i256: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mul_i32 s18, s0, s10 +; GFX9-NEXT: s_mov_b32 s16, s0 +; GFX9-NEXT: s_mul_i32 s18, s16, s10 ; GFX9-NEXT: s_mul_i32 s20, s1, s9 -; GFX9-NEXT: s_mul_hi_u32 s19, s0, s10 +; GFX9-NEXT: s_mul_hi_u32 s19, s16, s10 ; GFX9-NEXT: s_mul_hi_u32 s21, s1, s9 ; GFX9-NEXT: s_add_u32 s18, s20, s18 ; GFX9-NEXT: s_addc_u32 s19, s21, s19 @@ -1279,11 +1280,11 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 
inreg %den) { ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_mul_hi_u32 s22, s2, s8 ; GFX9-NEXT: s_add_u32 s18, s21, s18 -; GFX9-NEXT: s_mul_hi_u32 s17, s0, s8 +; GFX9-NEXT: s_mul_hi_u32 s17, s16, s8 ; GFX9-NEXT: s_addc_u32 s19, s22, s19 -; GFX9-NEXT: s_mul_i32 s22, s0, s9 +; GFX9-NEXT: s_mul_i32 s22, s16, s9 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9 +; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9 ; GFX9-NEXT: s_add_u32 s17, s22, s17 ; GFX9-NEXT: s_addc_u32 s18, s23, s18 ; GFX9-NEXT: s_mul_i32 s23, s1, s8 @@ -1291,10 +1292,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8 ; GFX9-NEXT: s_add_u32 s17, s23, s17 ; GFX9-NEXT: s_addc_u32 s18, s24, s18 -; GFX9-NEXT: s_mul_i32 s24, s0, s12 +; GFX9-NEXT: s_mul_i32 s24, s16, s12 ; GFX9-NEXT: s_mul_i32 s26, s1, s11 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s25, s0, s12 +; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12 ; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11 ; GFX9-NEXT: s_add_u32 s24, s26, s24 ; GFX9-NEXT: s_addc_u32 s25, s27, s25 @@ -1313,9 +1314,9 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_hi_u32 s30, s4, s8 ; GFX9-NEXT: s_add_u32 s24, s29, s24 ; GFX9-NEXT: s_addc_u32 s25, s30, s25 -; GFX9-NEXT: s_mul_i32 s30, s0, s11 +; GFX9-NEXT: s_mul_i32 s30, s16, s11 ; GFX9-NEXT: s_cselect_b32 s29, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s31, s0, s11 +; GFX9-NEXT: s_mul_hi_u32 s31, s16, s11 ; GFX9-NEXT: s_add_u32 s19, s30, s19 ; GFX9-NEXT: s_addc_u32 s24, s31, s24 ; GFX9-NEXT: s_mul_i32 s31, s1, s10 @@ -1341,10 +1342,10 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_addc_u32 s20, s20, 0 ; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_addc_u32 s20, s20, s24 -; GFX9-NEXT: s_mul_i32 s22, s0, s14 +; GFX9-NEXT: s_mul_i32 s22, s16, s14 ; GFX9-NEXT: s_mul_i32 s24, s1, s13 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_mul_hi_u32 s23, s0, s14 +; GFX9-NEXT: s_mul_hi_u32 s23, s16, s14 ; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 @@ -1368,8 +1369,8 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_mul_hi_u32 s35, s6, s8 ; GFX9-NEXT: s_add_u32 s22, s24, s22 ; GFX9-NEXT: s_addc_u32 s23, s35, s23 -; GFX9-NEXT: s_mul_i32 s24, s0, s13 -; GFX9-NEXT: s_mul_hi_u32 s35, s0, s13 +; GFX9-NEXT: s_mul_i32 s24, s16, s13 +; GFX9-NEXT: s_mul_hi_u32 s35, s16, s13 ; GFX9-NEXT: s_add_u32 s24, s24, s25 ; GFX9-NEXT: s_addc_u32 s22, s35, s22 ; GFX9-NEXT: s_mul_i32 s35, s1, s12 @@ -1414,31 +1415,30 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_cmp_lg_u32 s29, 0 ; GFX9-NEXT: s_addc_u32 s26, s26, 0 ; GFX9-NEXT: s_cmp_lg_u32 s24, 0 -; GFX9-NEXT: s_mul_i32 s16, s0, s8 ; GFX9-NEXT: s_addc_u32 s22, s26, s22 -; GFX9-NEXT: s_mul_i32 s0, s0, s15 -; GFX9-NEXT: s_addc_u32 s0, s23, s0 +; GFX9-NEXT: s_mul_i32 s16, s16, s15 +; GFX9-NEXT: s_addc_u32 s15, s23, s16 ; GFX9-NEXT: s_mul_i32 s1, s1, s14 ; GFX9-NEXT: s_cmp_lg_u32 s39, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s1 +; GFX9-NEXT: s_addc_u32 s1, s15, s1 ; GFX9-NEXT: s_mul_i32 s2, s2, s13 ; GFX9-NEXT: s_cmp_lg_u32 s38, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s2 ; GFX9-NEXT: s_mul_i32 s3, s3, s12 ; GFX9-NEXT: s_cmp_lg_u32 s37, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s3 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: s_mul_i32 s4, s4, s11 ; GFX9-NEXT: s_cmp_lg_u32 s36, 0 -; 
GFX9-NEXT: s_addc_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s4 ; GFX9-NEXT: s_mul_i32 s5, s5, s10 ; GFX9-NEXT: s_cmp_lg_u32 s35, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s5 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: s_mul_i32 s6, s6, s9 ; GFX9-NEXT: s_cmp_lg_u32 s25, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s6 +; GFX9-NEXT: s_addc_u32 s1, s1, s6 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 -; GFX9-NEXT: s_add_u32 s7, s7, s0 -; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: s_mul_i32 s0, s0, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s1 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s3, s19 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index ff294d8378005..4248f7b6a1583 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -26,131 +26,133 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v6, v1, v0 -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v7 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v6 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v7, vcc -; CHECK-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; CHECK-NEXT: v_trunc_f32_e32 v3, v2 -; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v1 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v8, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v11, v1 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v2 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v2, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 +; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 +; CHECK-NEXT: v_trunc_f32_e32 v8, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 +; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v7 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 +; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 
v14, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v2, vcc -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v1 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v8, v[2:3] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v1 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v1, v11, v1 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v9 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v12, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v10, v5, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v11, v4, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v10, v4, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, v1, v3 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v10, 0 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 +; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v7 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] +; 
CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc +; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 +; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 +; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 +; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v11, v3 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[2:3] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, v[2:3] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v4, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 -; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 -; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 +; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; CHECK-NEXT: v_sub_i32_e32 
v3, vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v5, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -378,264 +380,265 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v5, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v4, v8 -; GISEL-NEXT: v_xor_b32_e32 v4, v9, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v4, vcc -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v9 -; GISEL-NEXT: v_trunc_f32_e32 v11, v10 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v17, v9 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v4, v8 +; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 +; GISEL-NEXT: v_sub_i32_e32 
v15, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v12, v9 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v16, 0 -; GISEL-NEXT: v_mov_b32_e32 v9, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v15, v[9:10] +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] +; GISEL-NEXT: v_mov_b32_e32 v5, v12 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v11 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v5, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 +; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v16, 
v11 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v5, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v11 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v13, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v15, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v13, v[10:11] -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v14, v10, vcc -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v14, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v14, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v4, v13, v[11:12] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v15, v11, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v15, v11 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; GISEL-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v1, vcc -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v1, v10 -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, v11, v12, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc +; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v6 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v15, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v14 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v14, vcc +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v18, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 ; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 ; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v6, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v18, v[1:2] +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v15, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v20, v18, v[1:2] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v19, v[4:5] -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v21, v19, v[10:11] +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v17, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v19, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v19, v10 ; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; 
GISEL-NEXT: v_mul_lo_u32 v16, v18, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v19, v4 +; GISEL-NEXT: v_mul_lo_u32 v16, v18, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v18, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v18, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v13, v15, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v12, v[4:5] -; GISEL-NEXT: v_xor_b32_e32 v1, v11, v8 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v4 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v20, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc +; GISEL-NEXT: v_ashrrev_i32_e32 v14, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9] +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; GISEL-NEXT: v_xor_b32_e32 v1, v4, v13 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v14 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v10, v8 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v14 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v16, v4 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v16, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v12, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v11, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v4, v9, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v11, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v8, v12, v13 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v13 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v8, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 
v8, vcc, 0, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v9, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -659,128 +662,128 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v5, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v5 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CGP-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CGP-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; CGP-NEXT: v_trunc_f32_e32 v3, v2 -; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v3 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v15, v[2:3] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v12, v[2:3] -; CGP-NEXT: v_mul_lo_u32 v3, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v15, v1 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v2 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v17 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v5, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CGP-NEXT: v_trunc_f32_e32 v5, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 +; CGP-NEXT: 
v_mul_hi_u32 v3, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v2 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v17, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v18, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v16 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v2, v15, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v1 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v15, v[2:3] +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] ; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v12, v[2:3] -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v3, v13 -; CGP-NEXT: v_mul_lo_u32 v3, v15, v1 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v1, v15, v1 +; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 ; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v16, v1 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v2, v15, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; 
CGP-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v10, v1 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_add_i32_e32 v14, vcc, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v14, 0 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v3 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v12, v[2:3] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v14, v[2:3] -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v10, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v10, v2 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5 -; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v5, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 -; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v10, v11, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 +; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CGP-NEXT: 
v_cndmask_b32_e64 v15, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v11, vcc +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v4, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 ; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc @@ -832,128 +835,128 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: .LBB2_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 -; CGP-NEXT: v_xor_b32_e32 v7, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v7 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v6 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v7, vcc -; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v5, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v13, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v4 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; CGP-NEXT: v_trunc_f32_e32 v7, v6 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v15, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; 
CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v3 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v13, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] ; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v5, v11 -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 ; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v13, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v4 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: 
v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v3, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v12, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v8, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v8, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v7, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7 -; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 +; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 ; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v7 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 ; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc @@ -1128,173 +1131,173 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v6, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v6 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v9, v[5:6] -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v6, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: 
v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v4, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v9, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v6 +; GISEL-NEXT: v_xor_b32_e32 v9, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v11, v8 ; GISEL-NEXT: v_xor_b32_e32 v12, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v9, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0x1000 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v7 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v5, v11, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8] ; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x1000 +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v7, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, 0x1000 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v10, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v7 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v8, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, 
v[6:7] +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v0 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v10, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v1, v8, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v6 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -1302,7 +1305,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 ; GISEL-NEXT: 
v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 @@ -1313,13 +1316,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 +; GISEL-NEXT: v_xor_b32_e32 v9, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1341,7 +1344,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc @@ -1367,10 +1370,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: @@ -1378,8 +1381,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s6, 0xf000 -; CGP-NEXT: s_movk_i32 s7, 0x1000 +; CGP-NEXT: s_movk_i32 s7, 0xf000 +; CGP-NEXT: s_movk_i32 s6, 0x1000 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1390,7 +1393,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -1414,11 +1417,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] +; 
CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -1465,11 +1468,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v1, v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v10, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v11, v6, vcc @@ -1495,10 +1498,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v15, v[1:2] ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v11 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc @@ -1528,11 +1531,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] ; CGP-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -1584,11 +1587,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v10, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v10, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -1758,173 +1761,173 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: 
v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v6, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v6 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v9, v[5:6] -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v6, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GISEL-NEXT: v_trunc_f32_e32 v7, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v11, v9, v7 +; GISEL-NEXT: v_mul_lo_u32 v12, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v4, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, v[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v9, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v4, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v9, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v6 +; GISEL-NEXT: v_xor_b32_e32 v9, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v0, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v11, v8 ; GISEL-NEXT: v_xor_b32_e32 v12, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; 
GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v11, v8 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v9, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v5, 0x12d8fb -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v7 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v5, v11, 
v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8] ; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v12, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v7, vcc +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v10, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v7 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v6, v6 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v8, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v12 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7] +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v16, vcc +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; 
GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v0 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v10, v7, vcc ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v1, v8, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v6 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc @@ -1932,7 +1935,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v6 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 @@ -1943,13 +1946,13 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 ; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 ; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 +; GISEL-NEXT: v_xor_b32_e32 v9, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1971,7 +1974,7 @@ 
define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v9, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc @@ -1997,10 +2000,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -2008,8 +2011,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s6, 0xffed2705 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb +; CGP-NEXT: s_mov_b32 s7, 0xffed2705 +; CGP-NEXT: s_mov_b32 s6, 0x12d8fb ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -2020,7 +2023,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6] ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 @@ -2044,11 +2047,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; CGP-NEXT: v_mov_b32_e32 v4, v7 -; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8] @@ -2095,11 +2098,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v1, v6 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v10, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v10, v[1:2] ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v11, v6, vcc @@ -2125,10 +2128,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; 
CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v14, -1, v14, vcc -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v15, v[1:2] ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v11 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7] ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v12, vcc @@ -2158,11 +2161,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v15, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2] ; CGP-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7] @@ -2214,11 +2217,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 ; CGP-NEXT: v_mul_hi_u32 v10, v12, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v9, 0 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v10, v[3:4] +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[3:4] ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 @@ -2276,131 +2279,131 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v1, v0 -; CHECK-NEXT: v_xor_b32_e32 v8, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v7 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v8 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc -; CHECK-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; CHECK-NEXT: v_trunc_f32_e32 v5, v2 -; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v5 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v12, v1 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v12, v1 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v5 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v6, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v2, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 +; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 +; CHECK-NEXT: 
v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; CHECK-NEXT: v_trunc_f32_e32 v7, v6 +; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v14, v1 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; CHECK-NEXT: v_mul_hi_u32 v5, v12, v5 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v1 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v2, vcc -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v9, 0 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v10 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[5:6] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v2, v10 -; CHECK-NEXT: v_mul_lo_u32 v2, v12, v1 -; CHECK-NEXT: v_mul_lo_u32 v6, v9, v5 -; CHECK-NEXT: v_xor_b32_e32 v11, v3, v10 -; CHECK-NEXT: v_mul_hi_u32 v3, v9, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v12, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v5 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 +; 
CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 +; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v5, v12, v5 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v1 -; CHECK-NEXT: v_mul_lo_u32 v5, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v6, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v11, v1 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v1, v3 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v6, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v3 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v5, v[2:3] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, v[2:3] -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v11, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v11, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v8 -; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; 
CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v7 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v8 -; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v3, v4, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v8 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -2445,89 +2448,90 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v5, v4 -; GISEL-NEXT: v_xor_b32_e32 v5, v8, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v7 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v5, v4 +; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v7 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, 0, v5, vcc -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, 
v8 -; GISEL-NEXT: v_trunc_f32_e32 v10, v9 -; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v10 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v10 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v12, v11, 0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v12, v14, v[9:10] -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v8 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v11, v[9:10] -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; GISEL-NEXT: v_trunc_f32_e32 v11, v9 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v7, v15, v9 +; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v11, v8 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v12, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v8, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v12, v14, v[8:9] -; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[10:11] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v0, v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v10 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v1, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; 
GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v7 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[7:8] +; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v14, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v14, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 ; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 @@ -2535,165 +2539,165 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v8, v15, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v7, v15, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v11, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[6:7], v8, v16, v[1:2] ; GISEL-NEXT: v_lshl_b64 v[11:12], s[4:5], v6 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v14, v[9:10] -; GISEL-NEXT: 
v_subb_u32_e64 v1, s[4:5], v16, v9, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v16, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v15, v[9:10] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v14, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v9 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v9, v10, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v6 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v12, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v10, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v11 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v12 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v14 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v15, vcc -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v13, v9, v10, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v12, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 +; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v9 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v15 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v8 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v18, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v18 +; GISEL-NEXT: v_trunc_f32_e32 v12, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v12 ; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v20, vcc, 0, v10 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v20, v18, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v21, v19, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v17, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v12 +; GISEL-NEXT: v_subb_u32_e32 v21, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v20, v22, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v17 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v21, v19, v[11:12] +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v18, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v18, v0 -; 
GISEL-NEXT: v_mul_lo_u32 v10, v19, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v16 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v19, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v19, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v22, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v18, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v19, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 +; GISEL-NEXT: v_mul_lo_u32 v14, v22, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v19, v0 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v18, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v0 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v22, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v11, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v10, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v13, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v15, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v12, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v21, v11, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc ; GISEL-NEXT: v_xor_b32_e32 v5, v2, v13 -; GISEL-NEXT: v_mul_lo_u32 v2, v10, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v9, v4 +; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 ; GISEL-NEXT: v_xor_b32_e32 v15, v3, v13 -; GISEL-NEXT: v_mul_hi_u32 v3, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v4 ; GISEL-NEXT: 
v_add_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_mul_hi_u32 v14, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v10, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc ; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v4, v5, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GISEL-NEXT: v_mul_hi_u32 v4, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v8 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v9, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v12 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v11 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v12 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; 
GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v11 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v12 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc @@ -2701,9 +2705,9 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; GISEL-NEXT: v_xor_b32_e32 v4, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2729,131 +2733,131 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v10, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v10, vcc -; CGP-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 -; CGP-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CGP-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; CGP-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; CGP-NEXT: v_trunc_f32_e32 v3, v2 -; CGP-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v14, v[2:3] -; CGP-NEXT: v_mul_hi_u32 v15, v11, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v11, v[2:3] -; CGP-NEXT: v_mul_lo_u32 v3, v14, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 -; CGP-NEXT: v_mul_lo_u32 v16, v11, v2 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v1, v0 +; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; CGP-NEXT: v_trunc_f32_e32 v10, v4 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v10 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v15, v3 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v16, 
v12, v10 +; CGP-NEXT: v_mul_lo_u32 v17, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v11, v2 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v17, v1 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v17, v3 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v15, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v1 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v14, v[2:3] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v15, v11, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v11, v[2:3] -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v3, v12 -; CGP-NEXT: v_mul_lo_u32 v3, v14, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v12 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v15, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v1 -; CGP-NEXT: v_mul_lo_u32 v11, v9, v2 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v8, v1 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v8, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v3 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v11, v[2:3] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, v[2:3] -; 
CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v8, v2 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v10 -; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v16, v11 +; CGP-NEXT: v_mul_hi_u32 v10, v15, v10 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v13 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v4, v13 +; CGP-NEXT: v_mul_lo_u32 v4, v15, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v10 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v10 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v8, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v14, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v3, v8 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v10, 0 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v12, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v10, v[8:9] +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v14, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_subb_u32_e32 v8, vcc, v8, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v10 -; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] 
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_cndmask_b32_e64 v4, v9, v11, s[4:5] +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v10 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v8 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v2, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v12, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -2904,128 +2908,130 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: .LBB8_7: ; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v3, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v10 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v6 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v10, vcc -; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CGP-NEXT: v_trunc_f32_e32 v8, v4 -; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v14, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v3 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[8:9] -; CGP-NEXT: v_mul_hi_u32 v9, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v8 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v10, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; CGP-NEXT: v_mul_f32_e32 v8, 
0x2f800000, v6 +; CGP-NEXT: v_trunc_f32_e32 v10, v8 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mov_b32_e32 v6, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v11, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v15, v9 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v3 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v14, v[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mov_b32_e32 v6, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] ; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[8:9] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v14, v3 -; CGP-NEXT: v_mul_lo_u32 v9, v11, v8 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 ; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v11, v8 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 +; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 ; 
CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v9, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v10 -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, 
v6 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v10 -; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v5, v7, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v9 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 +; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] +; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v6, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 ; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc @@ -3164,7 +3170,6 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4 ; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] ; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0 @@ -3196,6 +3201,7 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 ; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 @@ -3213,163 +3219,162 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v8, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v12, v[0:1] -; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6] +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8] -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v11, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 -; 
GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], 0, v0 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v10 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v5, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v11 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v4, vcc -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v4, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[4:5] -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, 0 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v10, vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v9 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v17, v14, v[5:6] -; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 0, v2 +; GISEL-NEXT: v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v2 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v13 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7] +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v10, v13, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 
1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GISEL-NEXT: v_mul_hi_u32 v2, v14, v5 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GISEL-NEXT: v_mul_hi_u32 v1, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v15, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v3, 0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v18 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v16, v4, v[1:2] -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v19, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v3, v[1:2] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v2, v4, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v3, v0 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v14, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v17 +; GISEL-NEXT: v_mov_b32_e32 v1, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2] +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v18, v14, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_mul_lo_u32 v7, v11, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 +; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, 
v4, v1 -; GISEL-NEXT: v_mul_hi_u32 v0, v4, v0 -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v3, v1 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v10, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v7 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v4, v1 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v2 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v3, v0 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v4, v1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v1 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v1 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v7, 0 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 +; GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, v[0:1] -; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v7, v[3:4] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v6, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v11, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v8 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: 
v_cndmask_b32_e64 v4, v5, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1] +; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v9, v5 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4 +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2 +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v6, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 6c7e4fe3f01c7..4c444f46ff3dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1281,22 +1281,22 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s4, s13, 31 -; GFX8-NEXT: s_ashr_i32 s16, s1, 31 -; GFX8-NEXT: s_add_u32 s12, s12, s4 -; GFX8-NEXT: s_addc_u32 s13, s13, s4 -; GFX8-NEXT: s_add_u32 s0, s0, s16 -; GFX8-NEXT: s_mov_b32 s17, s16 -; GFX8-NEXT: s_addc_u32 s1, s1, s16 -; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[16:17] -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX8-NEXT: s_ashr_i32 s6, s1, 31 +; GFX8-NEXT: s_add_u32 s16, s12, s4 +; GFX8-NEXT: s_addc_u32 s17, s13, s4 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_mov_b32 s7, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s6 +; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12 ; GFX8-NEXT: s_mov_b32 s5, s4 -; GFX8-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] +; GFX8-NEXT: s_xor_b64 s[16:17], s[16:17], s[4:5] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s18, 0, s6 -; GFX8-NEXT: s_subb_u32 s19, 0, s7 +; GFX8-NEXT: s_sub_u32 s18, 0, s12 +; GFX8-NEXT: s_subb_u32 s19, 0, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 @@ -1337,9 +1337,9 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: s_xor_b64 s[18:19], s[4:5], s[16:17] -; GFX8-NEXT: s_ashr_i32 s16, s3, 31 -; GFX8-NEXT: s_mov_b32 s17, s16 +; GFX8-NEXT: s_xor_b64 s[18:19], s[4:5], s[6:7] +; GFX8-NEXT: s_ashr_i32 s6, s15, 31 +; GFX8-NEXT: s_mov_b32 s7, s6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 @@ -1359,64 +1359,65 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, s17, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s16, v1 +; GFX8-NEXT: v_mul_hi_u32 v4, s16, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s17, v0 +; GFX8-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1 +; GFX8-NEXT: v_mul_lo_u32 v4, s17, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s13 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2] -; 
GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_ashr_i32 s12, s15, 31 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v3, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v6, s17 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s16, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: s_ashr_i32 s16, s3, 31 ; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s13, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s17, v1 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s6, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s12, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: s_add_u32 s0, s14, s12 -; GFX8-NEXT: s_addc_u32 s1, s15, s12 +; GFX8-NEXT: s_add_u32 s0, s14, s6 +; GFX8-NEXT: s_addc_u32 s1, s15, s6 ; GFX8-NEXT: s_add_u32 s2, s2, s16 +; GFX8-NEXT: s_mov_b32 s17, s16 ; GFX8-NEXT: s_addc_u32 s3, s3, s16 ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s6, v8 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s12, v8 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 @@ -1430,8 +1431,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v12 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GFX8-NEXT: s_mov_b32 s13, s12 -; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] ; GFX8-NEXT: s_sub_u32 s5, 0, s2 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v13, 0 @@ -1504,37 +1504,37 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s7, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s6, v3 +; GFX8-NEXT: v_mul_lo_u32 v7, s13, v2 +; GFX8-NEXT: v_mul_lo_u32 v8, s12, v3 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s6, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, s12, v2 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; 
GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s7, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s13, v3 +; GFX8-NEXT: v_mul_hi_u32 v2, s13, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s6, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s12, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s7, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, s13, v3 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s7 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s13 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s12, v2 ; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s7, v6 +; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s13, v6 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 @@ -1567,16 +1567,16 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v2, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[16:17] ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v8 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v9 ; GFX8-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s12, v6 -; GFX8-NEXT: v_xor_b32_e32 v7, s12, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX8-NEXT: v_mov_b32_e32 v8, s6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_mov_b32_e32 v9, s9 @@ -1593,22 +1593,22 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s13, 31 -; GFX9-NEXT: s_ashr_i32 s16, s1, 31 -; GFX9-NEXT: s_add_u32 s12, s12, s4 -; GFX9-NEXT: s_addc_u32 s13, s13, s4 -; GFX9-NEXT: s_add_u32 s0, s0, s16 -; GFX9-NEXT: s_mov_b32 s17, s16 -; GFX9-NEXT: s_addc_u32 s1, s1, s16 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[16:17] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_u32 s16, s12, s4 +; GFX9-NEXT: s_addc_u32 s17, s13, s4 +; GFX9-NEXT: s_add_u32 s0, s0, s6 +; GFX9-NEXT: s_mov_b32 s7, s6 +; GFX9-NEXT: s_addc_u32 s1, s1, s6 +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] +; GFX9-NEXT: s_xor_b64 s[16:17], s[16:17], s[4:5] ; GFX9-NEXT: v_mul_f32_e32 
v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s18, 0, s6 -; GFX9-NEXT: s_subb_u32 s19, 0, s7 +; GFX9-NEXT: s_sub_u32 s18, 0, s12 +; GFX9-NEXT: s_subb_u32 s19, 0, s13 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 @@ -1642,16 +1642,16 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v3, v[1:2] ; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: s_xor_b64 s[18:19], s[4:5], s[16:17] -; GFX9-NEXT: s_ashr_i32 s16, s3, 31 -; GFX9-NEXT: s_mov_b32 s17, s16 +; GFX9-NEXT: s_xor_b64 s[18:19], s[4:5], s[6:7] +; GFX9-NEXT: s_ashr_i32 s6, s15, 31 +; GFX9-NEXT: s_mov_b32 s7, s6 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 @@ -1670,64 +1670,65 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 +; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v5, 0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v4, v3, v0, v6 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v4, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s12, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s7, v5, v[2:3] -; GFX9-NEXT: s_ashr_i32 s12, s15, 31 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v4, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s17 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s16, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v5, v[2:3] +; GFX9-NEXT: s_ashr_i32 s16, s3, 31 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v1, s13, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v6 +; GFX9-NEXT: v_sub_u32_e32 v1, s17, v2 +; GFX9-NEXT: 
v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s12, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[0:1] ; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s14, s12 -; GFX9-NEXT: s_addc_u32 s1, s15, s12 +; GFX9-NEXT: s_add_u32 s0, s14, s6 +; GFX9-NEXT: s_addc_u32 s1, s15, s6 ; GFX9-NEXT: s_add_u32 s2, s2, s16 +; GFX9-NEXT: s_mov_b32 s17, s16 ; GFX9-NEXT: s_addc_u32 s3, s3, s16 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 -; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s6, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v16, vcc, s12, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v17, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v15 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v7 @@ -1741,29 +1742,28 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v13 ; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GFX9-NEXT: s_mov_b32 s13, s12 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v14, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v14, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_subb_u32 s14, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v13, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v11, vcc -; GFX9-NEXT: v_mul_hi_u32 v11, v14, v1 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v14, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v3, v13, v1 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 +; GFX9-NEXT: v_mul_hi_u32 v11, v14, v1 ; GFX9-NEXT: v_mul_lo_u32 v4, v14, v2 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 ; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] ; GFX9-NEXT: v_mul_lo_u32 v11, v13, v2 +; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v14, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, 
v13, v2 @@ -1812,18 +1812,18 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add3_u32 v4, v8, v7, v4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v11, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v12, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s7, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s6, v4 -; GFX9-NEXT: v_mul_hi_u32 v10, s6, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX9-NEXT: v_mul_hi_u32 v12, s7, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s13, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, s12, v4 +; GFX9-NEXT: v_mul_hi_u32 v10, s12, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s13, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s13, v4 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, s7, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, s13, v4 ; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, s12, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, s4 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -1837,13 +1837,13 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc ; GFX9-NEXT: v_add3_u32 v9, v8, v7, v12 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 +; GFX9-NEXT: v_mov_b32_e32 v10, s13 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s12, v3 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8] ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 -; GFX9-NEXT: v_sub_u32_e32 v7, s7, v7 +; GFX9-NEXT: v_sub_u32_e32 v7, s13, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] @@ -1875,16 +1875,16 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17] +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[16:17] ; GFX9-NEXT: v_xor_b32_e32 v3, s0, v10 ; GFX9-NEXT: v_xor_b32_e32 v4, s1, v9 ; GFX9-NEXT: v_mov_b32_e32 v9, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v9, vcc -; GFX9-NEXT: v_xor_b32_e32 v7, s12, v7 -; GFX9-NEXT: v_xor_b32_e32 v8, s12, v8 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s12, v7 +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v8 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v9, vcc ; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[8:9] ; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 21fd7b594aca4..d0c55c69f5087 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -24,135 +24,135 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 
v2, v0 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v1, v0 -; CHECK-NEXT: v_xor_b32_e32 v6, v2, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3 -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 -; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CHECK-NEXT: v_trunc_f32_e32 v2, v1 -; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 -; CHECK-NEXT: v_mul_lo_u32 v12, v7, v1 -; CHECK-NEXT: v_mul_lo_u32 v13, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v2, v1 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 +; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CHECK-NEXT: v_trunc_f32_e32 v6, v3 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 +; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v0 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2] -; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v5 -; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2] -; CHECK-NEXT: v_add_i32_e32 
v2, vcc, v4, v8 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v8, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v2, v8 -; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0 -; CHECK-NEXT: v_mul_lo_u32 v9, v7, v1 -; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v11, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, v7, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v10, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 +; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v9, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; 
CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v2, v[1:2] -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v9, v[1:2] -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v4, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v4, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v6 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; 
CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v8 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -476,159 +476,159 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8 -; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v8, vcc ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v1, v9, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v0 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v0 -; GISEL-NEXT: v_xor_b32_e32 v14, v6, v0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v13 -; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v14 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v10, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v9, vcc ; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v8 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v17, v6, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v6, v1 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v19, s[4:5], 0, v13 -; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v21, v6 -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v12, v8, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v19, v21, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v20, v18, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v21, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v18, v5 -; GISEL-NEXT: v_cndmask_b32_e32 
v8, v16, v8, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v21, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_trunc_f32_e32 v16, v1 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v7, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v9, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v16, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v17, v[8:9] +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v20, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v17, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v21, v5 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; GISEL-NEXT: v_mul_hi_u32 v7, v18, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; GISEL-NEXT: v_mul_hi_u32 v5, v21, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v16, v8 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v0 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v21, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v7, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v12, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v20, v7, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v13, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v18, v14, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v13, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 
v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v8 +; GISEL-NEXT: v_xor_b32_e32 v12, v3, v10 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5 -; GISEL-NEXT: v_xor_b32_e32 v11, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_xor_b32_e32 v7, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v8, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, 
v13, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v8, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v11, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v11, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v14 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v14 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v13 -; GISEL-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v14 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v14 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v13 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64: @@ -646,131 +646,131 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_xor_b64 
s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz .LBB2_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v4, v2, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v3 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v4, vcc -; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 -; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; CGP-NEXT: v_trunc_f32_e32 v2, v1 -; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v0 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v14, v[1:2] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v5, v[1:2] -; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v1 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v1 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v16 +; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v5 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v1 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v1 +; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v4, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] +; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 +; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v16, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v17, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v0 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v14, v[1:2] +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 
v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] ; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v5, v[1:2] -; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v2, v12 -; CGP-NEXT: v_mul_lo_u32 v2, v14, v0 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v14, v0 +; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 +; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 ; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v1 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v1, v14, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v14, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v2, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v1 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v1 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 +; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v13, 0 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v2, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_subb_u32_e64 v2, s[4:5], v10, v1, vcc -; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v10, v1 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v10, vcc, v0, v3 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v1, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v4 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 @@ -815,131 +815,131 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] ; CGP-NEXT: .LBB2_7: -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v3, v2 -; CGP-NEXT: v_xor_b32_e32 v6, v4, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v5 -; CGP-NEXT: 
v_cvt_f32_u32_e32 v3, v6
-; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v5
-; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc
-; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
-; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CGP-NEXT: v_trunc_f32_e32 v4, v3
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4]
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v7, v[3:4]
-; CGP-NEXT: v_mul_lo_u32 v4, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v12, v2
-; CGP-NEXT: v_mul_lo_u32 v14, v7, v3
-; CGP-NEXT: v_mul_lo_u32 v15, v12, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v7
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v3
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v3
+; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
+; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; CGP-NEXT: v_trunc_f32_e32 v6, v5
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
+; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
+; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v14, v7, v5
+; CGP-NEXT: v_mul_lo_u32 v15, v12, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14
 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v7, v5
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v3, v12, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v2
-; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v3, vcc
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4]
+; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4
+; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
 ; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v7, v[3:4]
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v10
+; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10
 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v4, v10
-; CGP-NEXT: v_mul_lo_u32 v4, v12, v2
-; CGP-NEXT: v_mul_lo_u32 v11, v7, v3
-; CGP-NEXT: v_mul_hi_u32 v2, v12, v2
+; CGP-NEXT: v_xor_b32_e32 v9, v6, v10
+; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v11, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
 ; CGP-NEXT: v_xor_b32_e32 v8, v8, v10
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v12, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v12, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_mul_hi_u32 v3, v12, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc
-; CGP-NEXT: v_mul_lo_u32 v4, v8, v2
-; CGP-NEXT: v_mul_lo_u32 v7, v9, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v9, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v8, v2
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v3, v1
+; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_mul_hi_u32 v7, v9, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v8, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v9, v5
+; CGP-NEXT: v_mul_hi_u32 v11, v9, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v8, v4
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v8, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v7, v9, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v2, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v11, 0
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v4, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v11, v[3:4]
-; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v8, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v8, v3
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
 ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v5
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v6
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
 ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
@@ -1354,8 +1354,8 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000
 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT: s_movk_i32 s6, 0xf000
-; CGP-NEXT: s_movk_i32 s7, 0x1000
+; CGP-NEXT: s_movk_i32 s7, 0xf000
+; CGP-NEXT: s_movk_i32 s6, 0x1000
 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
@@ -1366,7 +1366,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000
 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6
 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4
 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
 ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4
@@ -1390,11 +1390,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0
 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc
 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1
 ; CGP-NEXT: v_mov_b32_e32 v4, v7
-; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5]
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5
 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc
 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8]
@@ -1442,11 +1442,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6
 ; CGP-NEXT: v_mul_hi_u32 v9, v11, v0
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v1, 0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0
 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[1:2]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v6, v[1:2]
 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0
 ; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v6, vcc
 ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6
@@ -1469,10 +1469,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_cvt_u32_f32_e32 v14, v6
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4
 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0
 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
 ; CGP-NEXT: v_cndmask_b32_e32 v15, -1, v7, vcc
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[1:2]
 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v4
 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7]
 ; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v12, vcc
@@ -1502,12 +1502,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1
 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0
 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v14, v1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
 ; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
 ; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc
 ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2]
 ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5
 ; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3
 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7]
@@ -1558,11 +1558,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
 ; CGP-NEXT: v_mul_hi_u32 v9, v12, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v3, 0
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v3, 0
 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v5, v[3:4]
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[3:4]
 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
 ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5
@@ -1975,8 +1975,8 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT: s_mov_b32 s6, 0xffed2705
-; CGP-NEXT: s_mov_b32 s7, 0x12d8fb
+; CGP-NEXT: s_mov_b32 s7, 0xffed2705
+; CGP-NEXT: s_mov_b32 s6, 0x12d8fb
 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
@@ -1987,7 +1987,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705
 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v6
 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v4
 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
 ; CGP-NEXT: v_mul_lo_u32 v6, v8, v4
@@ -2011,11 +2011,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v9, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0
 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v8, v5, vcc
 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1
 ; CGP-NEXT: v_mov_b32_e32 v4, v7
-; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5]
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v5
 ; CGP-NEXT: v_addc_u32_e32 v11, vcc, v1, v5, vcc
 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8]
@@ -2063,11 +2063,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6
 ; CGP-NEXT: v_mul_hi_u32 v9, v11, v0
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v1, 0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0
 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v6, v[1:2]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v6, v[1:2]
 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0
 ; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v6, vcc
 ; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v6
@@ -2090,10 +2090,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_cvt_u32_f32_e32 v14, v6
 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4
 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0
 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
 ; CGP-NEXT: v_cndmask_b32_e32 v15, -1, v7, vcc
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[1:2]
 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v4
 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7]
 ; CGP-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v12, vcc
@@ -2123,12 +2123,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v6, v1
 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v0
 ; CGP-NEXT: v_addc_u32_e32 v13, vcc, v14, v1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
 ; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
 ; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc
 ; CGP-NEXT: v_xor_b32_e32 v9, v6, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2]
 ; CGP-NEXT: v_xor_b32_e32 v1, v8, v5
 ; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3
 ; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7]
@@ -2179,11 +2179,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
 ; CGP-NEXT: v_mul_hi_u32 v9, v12, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s7, v3, 0
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v3, 0
 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s7, v5, v[3:4]
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[3:4]
 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
 ; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
 ; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5
@@ -2237,135 +2237,137 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 ; CHECK-NEXT: .LBB7_3:
-; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc
-; CHECK-NEXT: v_xor_b32_e32 v5, v1, v0
-; CHECK-NEXT: v_xor_b32_e32 v6, v2, v0
-; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5
-; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v5
-; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc
-; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; CHECK-NEXT: v_trunc_f32_e32 v2, v1
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v0
-; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0
-; CHECK-NEXT: v_mul_lo_u32 v12, v7, v1
-; CHECK-NEXT: v_mul_lo_u32 v13, v10, v1
+; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v1
+; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
+; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
+; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
+; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1
+; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
+; CHECK-NEXT: v_trunc_f32_e32 v7, v5
+; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, v6
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
+; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
+; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
+; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6
 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12
 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_mul_hi_u32 v11, v7, v1
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v13, v0
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5
 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v11, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v0
-; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v1, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v7, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v10, v[1:2]
-; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v4
-; CHECK-NEXT: v_mul_hi_u32 v11, v7, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v7, v[1:2]
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v8
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v8, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v2, v8
-; CHECK-NEXT: v_mul_lo_u32 v2, v10, v0
-; CHECK-NEXT: v_mul_lo_u32 v9, v7, v1
-; CHECK-NEXT: v_mul_hi_u32 v0, v10, v0
-; CHECK-NEXT: v_xor_b32_e32 v3, v3, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v11, v10, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT: v_mul_hi_u32 v9, v7, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; CHECK-NEXT: v_mul_hi_u32 v1, v10, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v3, v0
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v9, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0
-; CHECK-NEXT: v_mul_hi_u32 v10, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v9, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v9, 0
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, v6
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9
+; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
+; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v9, v[1:2]
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v3, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v3, v1
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v5
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
+; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4]
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
+; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v6
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v8
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
 ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT: ; implicit-def: $vgpr3
 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -2502,168 +2504,168 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v5, v8, v[1:2]
 ; GISEL-NEXT: v_lshl_b64 v[10:11], s[4:5], v6
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v12, v0
 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v14, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v0
-; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], v13, v8, vcc
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v8, vcc
 ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v13, v8
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7
 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v5
 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v1, v6, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v0, v7, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7
+; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v0, v7, vcc
 ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v13, v1, v6, s[4:5]
 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v11, v0, vcc
-; GISEL-NEXT: v_xor_b32_e32 v11, v1, v0
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v11
-; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v9, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v6, vcc
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc
+; GISEL-NEXT: v_xor_b32_e32 v6, v1, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v8, v0
+; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v8
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v5
+; GISEL-NEXT: v_subbrev_u32_e64 v15, s[4:5], 0, v9, vcc
 ; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v7
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v7
 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v15, v15, v1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, v10, v1, s[4:5]
 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v16, v1
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v16
+; GISEL-NEXT: v_trunc_f32_e32 v10, v1
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v10
 ; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v0
-; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v11
-; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v10, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 0, v6
+; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, v8, s[4:5]
 ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v6, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v18, v16, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v17, v[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v20, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v7, v17, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v13, v14, v20, vcc
-; GISEL-NEXT: v_mul_hi_u32 v14, v17, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14
+; GISEL-NEXT: v_cvt_u32_f32_e32 v20, v10
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v9, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v20, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v11, v5
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v17, v[9:10]
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v7, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v1, v20, v0
+; GISEL-NEXT: v_mul_lo_u32 v10, v17, v9
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v14, v16, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1
-; GISEL-NEXT: v_mul_hi_u32 v7, v17, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_mul_hi_u32 v5, v16, v5
+; GISEL-NEXT: v_mul_lo_u32 v11, v20, v9
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT: v_mul_hi_u32 v10, v17, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v9, v20, v9
 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v17, v0
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v16, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v7, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v9, v6, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v18, v14, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v8, v4
-; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v19, v7, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v8
-; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v8
-; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v0
+; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v20, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v18, v11, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v18, v15, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v7, v4
+; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v19, v11, v[9:10]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v2, v7
+; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9
+; GISEL-NEXT: v_xor_b32_e32 v13, v3, v7
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v5
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v9
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9
 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
+; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v15, v2, vcc
 ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0
 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v7, v9, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT: v_xor_b32_e32 v5, v5, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v9, 0
+; GISEL-NEXT: v_mul_lo_u32 v10, v13, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v10, v13, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0
 ; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v5, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1]
 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, v[5:6]
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v5, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10]
 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
 ; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8
 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v11
-; GISEL-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v10
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v11
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v6
+; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v10
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v12, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v7
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
 ; GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
@@ -2683,131 +2685,131 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; CGP-NEXT: s_cbranch_execz .LBB8_2
 ; CGP-NEXT: ; %bb.1:
-; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc
-; CGP-NEXT: v_xor_b32_e32 v3, v1, v0
-; CGP-NEXT: v_xor_b32_e32 v4, v2, v0
-; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3
-; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v3
-; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v4, vcc
-; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
-; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
-; CGP-NEXT: v_trunc_f32_e32 v2, v1
-; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v10, v0
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[1:2]
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v0
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v10, v[1:2]
-; CGP-NEXT: v_mul_lo_u32 v2, v13, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v13, v0
-; CGP-NEXT: v_mul_lo_u32 v15, v10, v1
-; CGP-NEXT: v_mul_lo_u32 v16, v13, v1
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15
+; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v3
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v2, v1
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v1
+; CGP-NEXT: v_xor_b32_e32 v1, v2, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0
+; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
+; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v0
+; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc
+; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
+; CGP-NEXT: v_trunc_f32_e32 v4, v3
+; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v10, v2
+; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4]
+; CGP-NEXT: v_mul_hi_u32 v14, v10, v2
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4]
+; CGP-NEXT: v_mul_lo_u32 v4, v13, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v13, v2
+; CGP-NEXT: v_mul_lo_u32 v15, v10, v3
+; CGP-NEXT: v_mul_lo_u32 v16, v13, v3
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v1
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT: v_mul_hi_u32 v14, v10, v3
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v16, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14
 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_mul_hi_u32 v1, v13, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v0
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v11, v13, v[1:2]
+; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v2
+; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v13, v[3:4]
 ; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v0
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v10, v[1:2]
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v11
+; CGP-NEXT: v_mul_hi_u32 v14, v10, v2
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v10, v[3:4]
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v11
 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v2, v11
-; CGP-NEXT: v_mul_lo_u32 v2, v13, v0
-; CGP-NEXT: v_mul_lo_u32 v12, v10, v1
-; CGP-NEXT: v_mul_hi_u32 v0, v13, v0
+; CGP-NEXT: v_xor_b32_e32 v9, v4, v11
+; CGP-NEXT: v_mul_lo_u32 v4, v13, v2
+; CGP-NEXT: v_mul_lo_u32 v12, v10, v3
+; CGP-NEXT: v_mul_hi_u32 v2, v13, v2
 ; CGP-NEXT: v_xor_b32_e32 v8, v8, v11
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v13, v1
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v12, v10, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v14, v13, v3
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v12, v10, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT: v_mul_hi_u32 v1, v13, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v13, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v10, v9, v1
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
-; CGP-NEXT: v_mul_hi_u32 v13, v8, v1
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v8, v1
+; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT: v_mul_hi_u32 v10, v9, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v8, v2
+; CGP-NEXT: v_mul_lo_u32 v10, v9, v3
+; CGP-NEXT: v_mul_hi_u32 v12, v9, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v3
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v12, v8, v3
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v9, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v0, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v12, 0
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v2, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v9, v0
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v12, v[1:2]
-; CGP-NEXT: v_subb_u32_e64 v2, s[4:5], v8, v1, vcc
-; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v8, v1
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v12, v[3:4]
+; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v8, v3, vcc
+; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v8, v3
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v4
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
 ; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, v0, v3
-; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v1, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v0
+; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v0
 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v9, v0
 ; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5]
 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
 ; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
 ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v11
 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v11
 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
@@ -2854,135 +2856,137 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CGP-NEXT: s_setpc_b64 s[30:31]
 ; CGP-NEXT: .LBB8_7:
-; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v10
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v2
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v2, vcc
-; CGP-NEXT: v_xor_b32_e32 v6, v3, v2
-; CGP-NEXT: v_xor_b32_e32 v8, v4, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6
-; CGP-NEXT: v_cvt_f32_u32_e32 v3, v8
-; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v6
-; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc
-; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
-; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CGP-NEXT: v_trunc_f32_e32 v4, v3
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v9, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v9, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4]
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4]
-; CGP-NEXT: v_mul_lo_u32 v4, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v12, v2
-; CGP-NEXT: v_mul_lo_u32 v14, v9, v3
-; CGP-NEXT: v_mul_lo_u32 v15, v12, v3
+; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v10
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v3
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v10, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v3
+; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
+; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
+; CGP-NEXT: v_trunc_f32_e32 v6, v6
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_mov_b32_e32 v4, v9
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v10, v11, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
+; CGP-NEXT: v_mul_lo_u32 v14, v11, v9
+; CGP-NEXT: v_mul_lo_u32 v15, v6, v9
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v3
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8
 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v3, v12, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v2
-; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v3, vcc
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v9, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v12, v[3:4]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v7
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4]
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v10
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v7, v4, v10
-; CGP-NEXT: v_mul_lo_u32 v4, v12, v2
-; CGP-NEXT: v_mul_lo_u32 v11, v9, v3
-; CGP-NEXT: v_mul_hi_u32 v2, v12, v2
-; CGP-NEXT: v_xor_b32_e32 v5, v5, v10
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v12, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT: v_mul_hi_u32 v11, v9, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_mul_hi_u32 v3, v12, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10
+; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_mov_b32_e32 v4, v9
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v4, v12
+; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
+; CGP-NEXT: v_mul_lo_u32 v10, v11, v9
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v12
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v6, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v12, v3, vcc
-; CGP-NEXT: v_mul_lo_u32 v4, v5, v2
-; CGP-NEXT: v_mul_lo_u32 v9, v7, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v5, v2
-; CGP-NEXT: v_mul_hi_u32 v12, v5, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v5, v3
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v13, v4
+; CGP-NEXT: v_mul_lo_u32 v8, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v13, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v7, v5
 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9
 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v2, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v4, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[3:4]
-; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v5, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v5, v3
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v8
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5]
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v6
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
+; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
 ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
 ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10
 ; CGP-NEXT: ; implicit-def: $vgpr5
 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -3166,157 +3170,157 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1]
 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6
 ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8]
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v11, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v11, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v0
-; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v9
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v10
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v4, v5, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v6
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v4
+; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0
+; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2
+; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v5, v3, vcc
+; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v10
 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v7, v1
-; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v7, s[4:5]
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v8, v1
 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v6, v4
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v7, v5
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7
 ; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v0
-; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v9
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v10, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v15, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v6
+; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v13, vcc
+; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v2
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v4, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v7
 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_mov_b32_e32 v0, v6
 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v18, v[0:1]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v1
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v18, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v17, v15, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v15, v[6:7]
 ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v6, v19, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v0, v18, v4
-; GISEL-NEXT: v_mul_lo_u32 v19, v15, v5
-; GISEL-NEXT: v_subb_u32_e32 v20, vcc, v2, v3, vcc
-; GISEL-NEXT: v_mul_hi_u32 v2, v15, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v7, v19, v0, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v0, v18, v5
+; GISEL-NEXT: v_mul_lo_u32 v19, v15, v6
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v13, v3, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v15, v5
 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v18, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v18, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; GISEL-NEXT: v_mul_hi_u32 v3, v15, v5
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v4, v18, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v0
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v18, v2, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v4, 0
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v13, v1
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v5, v[0:1]
-; GISEL-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v20, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v4, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT:
v_cndmask_b32_e32 v3, v13, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v6, v14, v18, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v18, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 +; GISEL-NEXT: v_mul_hi_u32 v19, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v19 +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v6, v18, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v13, 0 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v10, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v15, v[0:1] +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v2 -; GISEL-NEXT: v_mul_lo_u32 v7, v4, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v4, v2 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], 0, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v15, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v5 +; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], 0, v12 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v3, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v2, v5, v2 -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v4, v0 -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v0 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v13, v7 -; GISEL-NEXT: v_mul_hi_u32 v0, v5, v0 -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], v7, v3 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v3 -; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v4, v2 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v5, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v2 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v15, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v2, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, 0 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v4 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v4, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v7 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v2, v9 -; GISEL-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v9, v[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v6, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v4 +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v3, v2 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v10 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v6, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v6, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index fd244d3bf2def..65455d754be4f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -6251,15 +6251,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; ; GFX10-LABEL: s_ssubsat_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_sub_u32 s16, s0, s8 -; GFX10-NEXT: s_subb_u32 s17, s1, s9 -; GFX10-NEXT: s_subb_u32 s18, s2, s10 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] -; GFX10-NEXT: s_subb_u32 s19, s3, s11 -; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] +; GFX10-NEXT: s_sub_u32 s18, s0, s8 +; GFX10-NEXT: s_subb_u32 s19, s1, s9 +; GFX10-NEXT: s_subb_u32 s16, s2, s10 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] +; GFX10-NEXT: s_subb_u32 s17, s3, s11 +; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] ; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: s_and_b32 s0, 1, s20 @@ -6268,7 +6268,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_ashr_i32 s8, s19, 31 +; GFX10-NEXT: s_ashr_i32 s8, s17, 31 ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_add_u32 s9, s8, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 @@ -6304,12 +6304,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: 
v_cndmask_b32_e32 v2, v4, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, s16 -; GFX10-NEXT: v_mov_b32_e32 v4, s17 +; GFX10-NEXT: v_mov_b32_e32 v3, s18 +; GFX10-NEXT: v_mov_b32_e32 v4, s19 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: v_mov_b32_e32 v0, s16 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v2, s19 +; GFX10-NEXT: v_mov_b32_e32 v2, s17 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index df8f3a702e885..77737b356ff6e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -365,61 +365,61 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_udiv_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 -; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 +; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 +; GISEL-NEXT: v_trunc_f32_e32 v14, v14 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 
+; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -435,166 +435,166 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: 
v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 +; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: 
v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 ; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 ; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v1, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v19 ; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v18, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v10 -; GISEL-NEXT: v_sub_i32_e64 
v2, s[8:9], v2, v19 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 -; GISEL-NEXT: v_add_i32_e64 v8, s[12:13], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 -; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v17, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v8 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 +; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18 +; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 +; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 +; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v6 ; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v4 ; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v20, v4, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v4 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v6, v11 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, v6, v12 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v11, s[6:7] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], 0, v12, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v17, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[16:17] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v20, v4 -; GISEL-NEXT: v_addc_u32_e64 v20, s[6:7], 0, v0, s[10:11] -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v12, v18 -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], 0, v2, s[12:13] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v14 -; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], v1, v12, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v12 -; GISEL-NEXT: v_subb_u32_e64 v12, s[6:7], v3, v4, s[8:9] +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[16:17] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v19, v4 +; GISEL-NEXT: v_addc_u32_e64 v19, s[6:7], 0, v0, s[10:11] +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v17 +; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v2, s[12:13] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], v1, v16, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v16 +; GISEL-NEXT: v_subb_u32_e64 v16, s[6:7], v3, v4, s[8:9] ; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[22:23] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v11, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v7 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v7 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[8:9] -; GISEL-NEXT: 
v_cmp_eq_u32_e64 s[4:5], v12, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, v16, v6, s[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[18:19] ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v17, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v20, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, v5 @@ -602,19 +602,19 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[8:9] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v15, v19, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v20, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v18, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v13, v18, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v19, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v2, v17, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64: @@ -1250,61 +1250,61 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 ; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 ; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v5 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 ; 
GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 -; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v11 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 +; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 +; GISEL-NEXT: v_trunc_f32_e32 v14, v14 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -1320,144 +1320,144 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, 
v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v6, v6, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 +; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v6 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 
v15, vcc, v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v11, v6, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc +; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 ; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 ; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v15, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v21, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v10 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v18, v6 +; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e64 
v9, s[4:5], v9, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v19 ; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v8, v9 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v9 -; GISEL-NEXT: v_mul_lo_u32 v19, v4, v10 -; GISEL-NEXT: v_mul_lo_u32 v20, v5, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v9 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v16 -; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 1, v10 -; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v19 -; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], 1, v15 -; GISEL-NEXT: v_add_i32_e64 v6, s[12:13], v6, v13 -; GISEL-NEXT: v_add_i32_e64 v13, s[12:13], 1, v16 -; GISEL-NEXT: v_add_i32_e64 v11, s[14:15], v11, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v6 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v18, v4, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v6 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 1, v9 +; GISEL-NEXT: v_sub_i32_e64 v2, s[8:9], v2, v18 +; GISEL-NEXT: v_add_i32_e64 v18, s[10:11], 1, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[12:13], v15, v10 +; GISEL-NEXT: v_add_i32_e64 v15, s[12:13], 1, v14 +; GISEL-NEXT: v_add_i32_e64 v12, s[14:15], v21, v12 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[14:15], v0, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[16:17], v2, v4 ; GISEL-NEXT: v_sub_i32_e64 v0, s[18:19], v0, v7 ; GISEL-NEXT: v_sub_i32_e64 v2, s[20:21], v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v20, v7, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[22:23], v0, v7 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GISEL-NEXT: v_mul_lo_u32 v2, v4, v11 -; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v17, v12 -; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v11, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], v20, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[14:15] -; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v18 +; GISEL-NEXT: v_mul_lo_u32 v2, v4, v12 +; GISEL-NEXT: v_add_i32_e64 v4, s[24:25], v16, v20 +; GISEL-NEXT: v_addc_u32_e64 v7, s[6:7], 0, v12, s[6:7] +; GISEL-NEXT: 
v_add_i32_e64 v2, s[6:7], v19, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[14:15] +; GISEL-NEXT: v_add_i32_e64 v4, s[6:7], v4, v17 ; GISEL-NEXT: v_subb_u32_e64 v17, s[6:7], v1, v4, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[16:17] @@ -1470,18 +1470,18 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[10:11], v1, v8 ; GISEL-NEXT: v_addc_u32_e64 v1, s[12:13], 0, v7, s[12:13] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[22:23] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14 -; GISEL-NEXT: v_subb_u32_e64 v14, vcc, v3, v2, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_subb_u32_e64 v11, vcc, v3, v2, s[8:9] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v14, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5 ; GISEL-NEXT: v_subb_u32_e64 v2, s[8:9], v2, v5, s[8:9] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v2, vcc, 0, v2, s[20:21] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[14:15] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v14, v4, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[14:15] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v11, v4, s[8:9] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] @@ -1489,17 +1489,17 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v18, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v19, s[6:7] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v2 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v15, v19, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v16, v13, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v13, v18, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v15, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v4, v0, v17, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v1, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v12, v5, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 56943531ba8ae..fba8ef2948ade 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -1072,185 +1072,185 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, 
s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v5, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v6, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v5, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v4, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v4, s13 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v1, v2, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v0, v3, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v7 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v4, vcc -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v5 -; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX8-NEXT: v_trunc_f32_e32 v14, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v0 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v6, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 +; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v6 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX8-NEXT: v_trunc_f32_e32 v14, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v14 +; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v2, v16, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v14, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1] +; 
GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3] ; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12 ; GFX8-NEXT: v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v15, v[1:2] -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v4, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v14, v0 -; GFX8-NEXT: v_mul_lo_u32 v4, v15, v1 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v15, v[2:3] +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1 +; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2 ; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v2, vcc -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v0 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v14, v1 -; GFX8-NEXT: v_mul_hi_u32 v0, v14, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc +; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_lo_u32 v4, v14, v2 ; GFX8-NEXT: v_mul_hi_u32 v1, v14, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v5, v15, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v1 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, v14, v2, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v12, v17, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v14, v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v14, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v18, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v15, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v15, v[4:5] ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v19, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v6, v14, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GFX8-NEXT: v_mul_hi_u32 v7, v15, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v20, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v9 
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v6, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, v10, v19, s[0:1] ; GFX8-NEXT: v_mul_lo_u32 v7, v14, v3 -; GFX8-NEXT: v_mul_hi_u32 v2, v14, v2 -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, v15, v3 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v7, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v9, v15, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, v15, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v11, v20, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v8, v14, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v14, v3 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v7, v6 -; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 -; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v15, v2 -; GFX8-NEXT: v_addc_u32_e64 v3, s[0:1], v14, v3, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v6, s11, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, s10, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, s11, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_mul_hi_u32 v7, s10, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v9, v7 +; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v8, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v9 +; GFX8-NEXT: v_mul_hi_u32 v4, v14, v4 +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v8, v7 +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v4, v7 +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v15, v3 +; GFX8-NEXT: v_addc_u32_e64 v4, s[0:1], v14, v4, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3 +; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc +; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0 +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s11, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 
-; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s14, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s11 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s15, v8, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v2 -; GFX8-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s11, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s14, v7 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v2, vcc -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v8 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3 +; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 -; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v11 +; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13 -; GFX8-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v2, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v14, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v17, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, 
v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v8, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v13, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v9, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1] +; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[5:8] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udivrem_v2i64: @@ -1298,7 +1298,6 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s14 @@ -1338,181 +1337,183 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v6, 0 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v0, v5 +; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s8, v1 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v6, v[2:3] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v4, v2, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v10 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v2, v3, s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s15 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s12, v9 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 1, v6 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX9-NEXT: v_trunc_f32_e32 v15, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v15 -; GFX9-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v1 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, 
v13 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v12 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s13, v8, v[2:3] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v1 +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1] +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s15 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s14 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s12, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v8 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; GFX9-NEXT: v_trunc_f32_e32 v15, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0xcf800000, v15 +; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v16, 0 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v17, v3, v17, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13 ; GFX9-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v16, v[2:3] -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v16, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v12 -; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_hi_u32 v3, v16, v1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v16, v[4:5] +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 +; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v11 +; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_hi_u32 v1, v15, v1 -; GFX9-NEXT: v_add_u32_e32 v3, v7, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v16, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 +; GFX9-NEXT: 
v_mul_hi_u32 v3, v15, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v16, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v16, v1 -; GFX9-NEXT: v_add3_u32 v2, v4, v3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v7, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v3 +; GFX9-NEXT: v_add3_u32 v4, v6, v5, v4 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v16, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v4, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v18, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2] -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v19, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v7, v[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v18, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v15, v[3:4] +; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v19, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v16, v[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v13, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17 -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, v7, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v20, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v12, v7, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v21, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v12, v15, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 -; GFX9-NEXT: v_mul_hi_u32 v8, v7, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v12, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v8 +; GFX9-NEXT: v_mul_lo_u32 v8, v15, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, v16, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v20, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v11, v16, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v21, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], v8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], v8, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v6 -; GFX9-NEXT: v_add_u32_e32 v8, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_add3_u32 v4, v8, v6, v4 -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v7, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], v15, v4, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v6, s11, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, s10, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, s10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; 
GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v6, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, s11, v4 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, s10, v4 -; GFX9-NEXT: v_mul_hi_u32 v13, s11, v4 -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v8, v3 +; GFX9-NEXT: v_mul_lo_u32 v11, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 +; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX9-NEXT: v_mul_hi_u32 v9, v16, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v15, v6 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v11, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v8 +; GFX9-NEXT: v_add_u32_e32 v9, v11, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v3, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc -; GFX9-NEXT: v_add3_u32 v10, v7, v12, v13 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v11, s11 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s10, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v11, s[0:1], v11, v7, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 -; GFX9-NEXT: v_sub_u32_e32 v3, s11, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v12, vcc, s14, v8 -; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v16, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v15, v6, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc +; GFX9-NEXT: v_mul_hi_u32 v2, s10, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v5 +; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], v8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v6 +; GFX9-NEXT: v_add_u32_e32 v2, v9, v2 +; GFX9-NEXT: v_mul_hi_u32 v9, s10, v6 +; GFX9-NEXT: v_mul_hi_u32 v13, s11, v6 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v8, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s14, v12, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v10, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v11, v9 +; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13 +; GFX9-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v9, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v10, 
s11 +; GFX9-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v12, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v5 +; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 +; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s14, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v13 -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v12 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v3, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v18, s[0:1], 0, v15, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v14, v17, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v15, v18, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v13, v20, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v9, s[0:1] -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] -; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v18, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1] +; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 09e39569a5abb..097f6642cbc66 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -359,61 +359,61 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-LABEL: v_urem_v2i64: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; 
GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v7 +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v6 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v7, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 -; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 +; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 +; GISEL-NEXT: v_trunc_f32_e32 v14, v14 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v8, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, 
v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -429,102 +429,102 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v8, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v10 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 +; GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 +; GISEL-NEXT: v_mul_lo_u32 v8, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 
+; GISEL-NEXT: v_mul_lo_u32 v16, v13, v8 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v13, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] ; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 ; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 ; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 -; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; 
GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] @@ -533,28 +533,28 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v5, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v4, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v6, v11 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 ; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 ; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 ; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 ; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 +; GISEL-NEXT: v_mul_lo_u32 v9, v6, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 @@ -562,11 +562,11 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 ; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 ; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 +; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v10 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v10 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11 ; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc ; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] @@ -1751,63 +1751,63 @@ define <2 
x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b64 s[4:5], 0x1000 -; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 -; GISEL-NEXT: v_lshl_b64 v[6:7], s[4:5], v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v7 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v5, vcc -; GISEL-NEXT: v_subb_u32_e64 v15, vcc, 0, v7, s[4:5] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v12 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v11 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 +; GISEL-NEXT: v_lshl_b64 v[7:8], s[4:5], v4 +; GISEL-NEXT: v_lshl_b64 v[4:5], s[4:5], v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v8 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v7 +; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v5 +; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], 0, v4 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v8, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, vcc, 0, v5, s[4:5] +; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; GISEL-NEXT: v_mac_f32_e32 v13, 0x4f800000, v14 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v11, v13 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v9 -; GISEL-NEXT: v_mul_f32_e32 v12, 0x2f800000, v10 -; GISEL-NEXT: v_trunc_f32_e32 v11, v11 -; GISEL-NEXT: v_trunc_f32_e32 v12, v12 -; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v11 +; GISEL-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 +; GISEL-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10 +; GISEL-NEXT: v_mul_f32_e32 v14, 0x2f800000, v11 +; GISEL-NEXT: v_trunc_f32_e32 v13, v13 +; GISEL-NEXT: v_trunc_f32_e32 v14, v14 +; GISEL-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GISEL-NEXT: v_mac_f32_e32 v11, 0xcf800000, v14 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_mul_lo_u32 v17, v13, v12 -; GISEL-NEXT: v_mul_lo_u32 v18, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v10 -; GISEL-NEXT: v_mul_hi_u32 v20, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v20, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_lo_u32 v19, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v19, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v18 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v18 ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v19, v20 -; GISEL-NEXT: v_mul_lo_u32 v19, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v19, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v20, v15, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v20, v16 -; GISEL-NEXT: v_mul_hi_u32 v20, v8, 
v9 +; GISEL-NEXT: v_mul_hi_u32 v20, v6, v10 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v19 -; GISEL-NEXT: v_mul_lo_u32 v21, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v19 +; GISEL-NEXT: v_mul_lo_u32 v21, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v20, s[6:7], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v21, v9, v19 +; GISEL-NEXT: v_mul_hi_u32 v21, v10, v19 ; GISEL-NEXT: v_add_i32_e64 v20, s[8:9], v20, v21 -; GISEL-NEXT: v_mul_hi_u32 v19, v11, v19 -; GISEL-NEXT: v_mul_hi_u32 v18, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v20, v11, v16 +; GISEL-NEXT: v_mul_hi_u32 v19, v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v18 +; GISEL-NEXT: v_mul_lo_u32 v20, v13, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v17 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[12:13], v20, v18 -; GISEL-NEXT: v_mul_hi_u32 v20, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v20, v10, v16 ; GISEL-NEXT: v_add_i32_e64 v19, s[14:15], v19, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v10, v17 +; GISEL-NEXT: v_mul_hi_u32 v20, v11, v17 ; GISEL-NEXT: v_add_i32_e64 v18, s[16:17], v18, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[8:9] @@ -1823,102 +1823,102 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v23, vcc, v23, v24 ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v20 ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v22 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v19 -; GISEL-NEXT: v_mul_hi_u32 v16, v11, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v12, v17 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v18 +; GISEL-NEXT: v_add_i32_e64 v10, s[6:7], v10, v19 +; GISEL-NEXT: v_mul_hi_u32 v16, v13, v16 +; GISEL-NEXT: v_mul_hi_u32 v17, v14, v17 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v21, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v19, vcc, v23, v19 -; GISEL-NEXT: v_mul_lo_u32 v20, v8, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v14, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v9 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v10 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v15, v15, v10 -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v11, v16, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v16, v13, v10 -; GISEL-NEXT: v_addc_u32_e64 v12, vcc, v12, v17, s[8:9] -; GISEL-NEXT: v_mul_hi_u32 v17, v9, v20 -; GISEL-NEXT: v_mul_lo_u32 v8, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v19 -; GISEL-NEXT: v_mul_lo_u32 v13, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_lo_u32 v15, v11, v20 -; GISEL-NEXT: v_mul_hi_u32 v20, v11, v20 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v12, v19 -; GISEL-NEXT: v_mul_hi_u32 v19, v12, v19 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v11, v8 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v13 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v12, v11 +; 
GISEL-NEXT: v_addc_u32_e64 v13, vcc, v13, v16, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v16, v9, v11 +; GISEL-NEXT: v_addc_u32_e64 v14, vcc, v14, v17, s[8:9] +; GISEL-NEXT: v_mul_hi_u32 v17, v10, v20 +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 +; GISEL-NEXT: v_mul_hi_u32 v15, v11, v19 +; GISEL-NEXT: v_mul_lo_u32 v9, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v13, v20 +; GISEL-NEXT: v_mul_hi_u32 v20, v13, v20 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v13, v6 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, v11, v9 ; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], v16, v20 -; GISEL-NEXT: v_mul_lo_u32 v20, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v20, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v17, s[8:9], v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v10, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v11, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v9 ; GISEL-NEXT: v_add_i32_e64 v19, s[10:11], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v15 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v16, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[10:11] ; GISEL-NEXT: v_add_i32_e64 v18, s[8:9], v19, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v19, s[4:5], v20, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v20 +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v20 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[8:9] ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v20 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v19 ; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v17 -; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], v14, v18 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17 +; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v15, v18 ; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], v16, v19 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v2, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v14 -; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v15 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v11, vcc, v12, v13, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v11 
-; GISEL-NEXT: v_mul_lo_u32 v20, v3, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_mul_hi_u32 v16, v2, v11 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v10 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v2, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v3, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v12 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v15 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc +; GISEL-NEXT: v_addc_u32_e64 v9, vcc, v14, v9, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v0, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, v1, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v20, v3, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v18, v15 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v20, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v20, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 ; GISEL-NEXT: v_add_i32_e64 v13, s[6:7], v13, v19 -; GISEL-NEXT: v_add_i32_e64 v10, s[8:9], v10, v16 +; GISEL-NEXT: v_add_i32_e64 v11, s[8:9], v11, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] @@ -1927,80 +1927,80 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v20, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v16, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v17, v5, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v6, v10 +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 +; GISEL-NEXT: v_mul_lo_u32 v18, v4, v11 +; GISEL-NEXT: v_mul_lo_u32 v19, v5, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v18 -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v13 -; GISEL-NEXT: v_add_i32_e64 v11, s[6:7], v11, v12 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v6 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v4 -; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v4, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v11 +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v13 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v12 +; GISEL-NEXT: 
v_cmp_ge_u32_e64 s[6:7], v0, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v2, v4 +; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], v0, v7 +; GISEL-NEXT: v_sub_i32_e64 v13, s[12:13], v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v6, v7, v6 +; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v6 -; GISEL-NEXT: v_sub_i32_e64 v4, s[14:15], v12, v4 -; GISEL-NEXT: v_sub_i32_e64 v6, s[16:17], v13, v6 -; GISEL-NEXT: v_add_i32_e64 v8, s[18:19], v17, v8 -; GISEL-NEXT: v_add_i32_e64 v11, s[18:19], v19, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v13, v4 +; GISEL-NEXT: v_sub_i32_e64 v7, s[14:15], v12, v7 +; GISEL-NEXT: v_sub_i32_e64 v4, s[16:17], v13, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[18:19], v17, v6 +; GISEL-NEXT: v_add_i32_e64 v9, s[18:19], v19, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[8:9] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v8, v9 -; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v11, v10 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], v3, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[6:7], v6, v10 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], v9, v11 +; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v1, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6 +; GISEL-NEXT: v_subb_u32_e64 v6, s[6:7], v3, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v7 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v8, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v18, vcc, 0, v1, s[10:11] -; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v5, s[10:11] +; GISEL-NEXT: v_subb_u32_e64 v1, vcc, v1, v8, s[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e64 v14, vcc, 0, v3, s[12:13] -; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v7, s[12:13] +; GISEL-NEXT: v_subb_u32_e64 v3, vcc, v3, v5, s[12:13] ; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v8 ; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[14:15] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v5 ; GISEL-NEXT: v_subbrev_u32_e64 v3, s[6:7], 0, v3, s[16:17] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v5 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v8 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], v14, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[6:7] -; 
GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[8:9] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v6, s[8:9] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, v12, v7, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v4, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v1, v18, v1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v3, v14, v3, s[8:9] -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom: diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index e8ceeece372d4..7c846ed551713 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -581,35 +581,35 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 ; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 -; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11] +; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s5, v2 ; GFX908-NEXT: v_readfirstlane_b32 s9, v3 ; GFX908-NEXT: s_add_u32 s5, s5, 1 ; GFX908-NEXT: s_addc_u32 s9, s9, 0 -; GFX908-NEXT: s_mul_hi_u32 s19, s2, s5 +; GFX908-NEXT: s_mul_hi_u32 s21, s2, s5 ; GFX908-NEXT: s_mul_i32 s22, s3, s5 -; GFX908-NEXT: s_mul_i32 s18, s2, s5 +; GFX908-NEXT: s_mul_i32 s20, s2, s5 ; GFX908-NEXT: s_mul_i32 s5, s2, s9 -; GFX908-NEXT: s_add_i32 s5, s19, s5 +; GFX908-NEXT: s_add_i32 s5, s21, s5 ; GFX908-NEXT: s_add_i32 s5, s5, s22 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s20, s20, s0 +; GFX908-NEXT: s_add_u32 s18, s18, s0 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s21, s21, s1 +; GFX908-NEXT: s_addc_u32 s19, s19, s1 ; GFX908-NEXT: s_mov_b64 s[22:23], 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 ; GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: s_add_u32 s22, s20, s18 -; GFX908-NEXT: s_addc_u32 s23, s21, s5 +; GFX908-NEXT: s_add_u32 s22, s18, s20 +; GFX908-NEXT: s_addc_u32 s23, s19, s5 ; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc @@ -657,7 +657,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21 +; GFX908-NEXT: ; 
implicit-def: $sgpr18_sgpr19 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_xor_b64 s[16:17], s[22:23], -1 @@ -742,26 +742,26 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 -; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11] +; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 ; GFX90A-NEXT: s_add_u32 s5, s5, 1 ; GFX90A-NEXT: s_addc_u32 s9, s9, 0 -; GFX90A-NEXT: s_mul_hi_u32 s19, s2, s5 +; GFX90A-NEXT: s_mul_hi_u32 s21, s2, s5 ; GFX90A-NEXT: s_mul_i32 s22, s3, s5 -; GFX90A-NEXT: s_mul_i32 s18, s2, s5 +; GFX90A-NEXT: s_mul_i32 s20, s2, s5 ; GFX90A-NEXT: s_mul_i32 s5, s2, s9 -; GFX90A-NEXT: s_add_i32 s5, s19, s5 +; GFX90A-NEXT: s_add_i32 s5, s21, s5 ; GFX90A-NEXT: s_add_i32 s5, s5, s22 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s20, s20, s0 -; GFX90A-NEXT: s_addc_u32 s21, s21, s1 +; GFX90A-NEXT: s_add_u32 s18, s18, s0 +; GFX90A-NEXT: s_addc_u32 s19, s19, s1 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[22:23], 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] @@ -769,8 +769,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: s_add_u32 s22, s20, s18 -; GFX90A-NEXT: s_addc_u32 s23, s21, s5 +; GFX90A-NEXT: s_add_u32 s22, s18, s20 +; GFX90A-NEXT: s_addc_u32 s23, s19, s5 ; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc @@ -811,7 +811,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21 +; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index edab417a03ced..1b8216f4aa2a6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -9219,19 +9219,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 +; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[8:9], s[12:13], s8 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s10 -; GFX6-NEXT: s_ashr_i32 s14, s9, 31 -; GFX6-NEXT: s_add_u32 s8, s8, s14 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s10 
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX6-NEXT: s_ashr_i32 s14, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s14 ; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: s_addc_u32 s9, s9, s14 -; GFX6-NEXT: s_xor_b64 s[12:13], s[8:9], s[14:15] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_sub_u32 s10, 0, s12 -; GFX6-NEXT: s_subb_u32 s11, 0, s13 +; GFX6-NEXT: s_addc_u32 s3, s3, s14 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[14:15] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: s_sub_u32 s10, 0, s2 +; GFX6-NEXT: s_subb_u32 s11, 0, s3 ; GFX6-NEXT: s_ashr_i32 s16, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 @@ -9306,23 +9306,23 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] @@ -9332,22 +9332,22 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] ; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[14:15] -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s4 +; GFX6-NEXT: s_ashr_i32 s4, s13, 31 +; GFX6-NEXT: s_add_u32 s12, s12, s4 ; GFX6-NEXT: v_mov_b32_e32 v6, s5 ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: s_addc_u32 s3, s3, s4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: s_addc_u32 s13, s13, s4 +; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[4:5] ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 ; 
GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 @@ -9356,16 +9356,16 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s12, 0, s2 +; GFX6-NEXT: s_sub_u32 s2, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 -; GFX6-NEXT: s_subb_u32 s13, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX6-NEXT: s_subb_u32 s3, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 @@ -9384,11 +9384,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -9403,14 +9403,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s6, s6, s12 +; GFX6-NEXT: s_add_u32 s6, s6, s2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_addc_u32 s7, s7, s12 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s7, s7, s2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 @@ -9426,25 +9426,25 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s3 +; GFX6-NEXT: v_mov_b32_e32 v7, s13 ; GFX6-NEXT: 
v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s2, v5 +; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 ; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v7 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] @@ -9455,15 +9455,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] +; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -10454,17 +10454,17 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 ; GFX6-NEXT: s_ashr_i32 s8, s3, 31 ; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: s_addc_u32 s3, s3, s8 -; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[8:9] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 -; GFX6-NEXT: s_sub_u32 s2, 0, s16 -; GFX6-NEXT: s_subb_u32 s3, 0, s17 +; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX6-NEXT: s_sub_u32 s2, 0, s14 +; GFX6-NEXT: s_subb_u32 s3, 0, s15 ; GFX6-NEXT: s_ashr_i32 s12, s5, 31 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 @@ -10538,46 +10538,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 -; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s14, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s15, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, s14, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v3, s15 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], 
v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v0 +; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s14, v0 ; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s15, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v4 +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v4 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v5 -; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v4 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s15, v5 +; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s14, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX6-NEXT: s_ashr_i32 s0, s15, 31 -; GFX6-NEXT: s_add_u32 s2, s14, s0 +; GFX6-NEXT: s_ashr_i32 s0, s17, 31 +; GFX6-NEXT: s_add_u32 s2, s16, s0 ; GFX6-NEXT: s_mov_b32 s1, s0 -; GFX6-NEXT: s_addc_u32 s3, s15, s0 +; GFX6-NEXT: s_addc_u32 s3, s17, s0 ; GFX6-NEXT: v_mov_b32_e32 v4, s5 ; GFX6-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 ; GFX6-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX6-NEXT: v_rcp_f32_e32 v4, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 9a2e7874ea1f4..5990736f664fb 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -17,7 +17,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: 
renamable $sgpr34_sgpr35 = S_MOV_B64 -1 @@ -33,7 +33,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -46,7 +46,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58, $sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF @@ -59,7 +59,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, 
$vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr4 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc @@ -67,7 +67,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr25, implicit $exec @@ -85,7 +85,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 @@ -117,7 +117,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, 
$vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr21 = COPY renamable $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr20 = COPY $sgpr17, implicit $exec @@ -130,9 +130,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, 
$sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec ; GFX90A-NEXT: {{ $}} @@ -151,28 +151,28 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.12, 
implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec @@ -181,15 +181,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, 
$sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec @@ -198,7 +198,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.15.bb72: ; GFX90A-NEXT: successors: %bb.16(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc @@ -208,11 +208,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: $sgpr13 = COPY killed renamable $sgpr15 ; GFX90A-NEXT: $sgpr14 = COPY killed renamable $sgpr16 ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.16.Flow36: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, 
$vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec @@ -221,15 +221,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.17.bb67: ; GFX90A-NEXT: successors: %bb.18(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.18.Flow37: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, 
$vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec @@ -238,15 +238,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.19.bb62: ; GFX90A-NEXT: successors: %bb.20(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.20.Flow38: ; GFX90A-NEXT: successors: %bb.21(0x40000000), %bb.22(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec @@ -255,15 +255,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.21.bb54: ; GFX90A-NEXT: successors: %bb.22(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, 
$sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.22.Flow39:
; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -272,15 +272,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.23.bb47:
; GFX90A-NEXT: successors: %bb.24(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.24.Flow40:
; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -289,15 +289,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.25.bb40:
; GFX90A-NEXT: successors: %bb.26(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.26.Flow41:
; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -306,15 +306,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.27.bb33:
; GFX90A-NEXT: successors: %bb.28(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.28.Flow42:
; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -323,7 +323,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.29.Flow43:
; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
@@ -331,17 +331,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.30.bb19:
; GFX90A-NEXT: successors: %bb.31(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.31.Flow44:
; GFX90A-NEXT: successors: %bb.32(0x40000000), %bb.33(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr58_sgpr59, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr56_sgpr57, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX90A-NEXT: $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.33, implicit $exec
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock:
@@ -357,22 +357,22 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.34.bb26:
; GFX90A-NEXT: successors: %bb.29(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
; GFX90A-NEXT: S_BRANCH %bb.29
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.35.bb20:
; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1)
; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 -1
- ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0
+ ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
@@ -406,14 +406,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.36.Flow21:
; GFX90A-NEXT: successors: %bb.6(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
; GFX90A-NEXT: S_BRANCH %bb.6
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.37.bb27:
; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1)
; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec
@@ -443,7 +443,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.38.Flow22:
; GFX90A-NEXT: successors: %bb.36(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -455,16 +455,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr58_sgpr59, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
; GFX90A-NEXT: S_BRANCH %bb.36
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.39.bb34:
; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1)
; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec
@@ -493,7 +493,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.40.Flow23:
; GFX90A-NEXT: successors: %bb.38(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -504,16 +504,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr58_sgpr59, killed renamable $sgpr60_sgpr61, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc
; GFX90A-NEXT: S_BRANCH %bb.38
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.41.bb41:
; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc
@@ -545,7 +545,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.42.Flow24:
; GFX90A-NEXT: successors: %bb.40(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr20, implicit $exec
@@ -556,16 +556,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr58_sgpr59, killed renamable $sgpr60_sgpr61, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc
; GFX90A-NEXT: S_BRANCH %bb.40
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.43.bb55:
; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
@@ -579,7 +579,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.44:
; GFX90A-NEXT: successors: %bb.45(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr20, $vgpr61, $vgpr31, $vgpr63, $agpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr4, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr14
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr20, $vgpr61, $vgpr31, $vgpr63, $agpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr4, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr14
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF
@@ -599,14 +599,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.45.Flow26:
; GFX90A-NEXT: successors: %bb.47(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
@@ -615,7 +615,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.46.bb48:
; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc
@@ -648,7 +648,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.47.Flow25:
; GFX90A-NEXT: successors: %bb.42(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -657,30 +657,30 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr58_sgpr59, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
; GFX90A-NEXT: S_BRANCH %bb.42
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.48.bb63:
; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000)
- ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47
+ ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0
; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.49:
; GFX90A-NEXT: successors: %bb.44(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1
; GFX90A-NEXT: S_BRANCH %bb.44
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.50.bb68:
; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, $vgpr4_vgpr5, implicit $exec
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
@@ -688,7 +688,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.51:
; GFX90A-NEXT: successors: %bb.45(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
@@ -708,7 +708,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.52.bb80:
; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
@@ -737,7 +737,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.54.bb73:
; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
@@ -762,14 +762,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.55.Flow29:
; GFX90A-NEXT: successors: %bb.45(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc
; GFX90A-NEXT: S_BRANCH %bb.45
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.56.bb90:
; GFX90A-NEXT: successors: %bb.60(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
@@ -778,8 +778,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec
; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
- ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr58, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr59, killed $vgpr5, 1, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr56, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr5, 1, implicit $exec
; GFX90A-NEXT: renamable $vgpr30 = V_ALIGNBIT_B32_e64 $vgpr19, $vgpr18, 1, implicit $exec
; GFX90A-NEXT: renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
; GFX90A-NEXT: renamable $vgpr17 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec
@@ -793,7 +793,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr23, implicit $exec
; GFX90A-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr17, implicit $exec
- ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0
+ ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
@@ -826,7 +826,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.58.bb105:
; GFX90A-NEXT: successors: %bb.3(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
@@ -845,7 +845,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.59.bb85:
; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000)
- ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec
; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec
@@ -879,16 +879,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
- ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed
renamable $sgpr50_sgpr51, killed renamable $sgpr58_sgpr59, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -896,14 +896,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, 
$sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, 
$vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr4, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec @@ -912,21 +912,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, 
$sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY $exec + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, 
$vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr23, killed $vgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr27, implicit $exec @@ -946,7 +946,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr28 = V_OR_B32_e32 1, $vgpr26, implicit $exec ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr24, implicit $exec @@ -963,14 +963,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: 
%bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, 
$sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr27, implicit $exec @@ -999,14 +999,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, 
$vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, 
$sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr5 = V_OR_B32_e32 $vgpr52, killed $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $vgpr12 = V_OR_B32_e32 killed $vgpr5, killed $vgpr16, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 5cc88343faffa..e4c7df385d861 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -189,29 +189,29 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v8i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s12, 0xff00ff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v0, s3, s3, 8 -; SI-NEXT: v_alignbit_b32 v1, s3, s3, 24 -; SI-NEXT: v_alignbit_b32 v2, s2, s2, 8 -; SI-NEXT: v_alignbit_b32 v4, s2, s2, 24 -; SI-NEXT: v_alignbit_b32 v5, s1, s1, 8 -; SI-NEXT: v_alignbit_b32 v6, s1, s1, 24 -; SI-NEXT: v_alignbit_b32 v7, s0, s0, 8 -; SI-NEXT: v_alignbit_b32 v8, s0, s0, 24 -; SI-NEXT: v_alignbit_b32 v9, s7, s7, 8 -; SI-NEXT: v_alignbit_b32 v10, s7, s7, 24 -; SI-NEXT: v_alignbit_b32 v11, s6, s6, 8 -; SI-NEXT: v_alignbit_b32 v12, s6, s6, 24 -; SI-NEXT: v_alignbit_b32 v13, s5, s5, 8 -; SI-NEXT: v_alignbit_b32 v14, s5, s5, 24 -; SI-NEXT: v_alignbit_b32 v15, s4, s4, 8 -; SI-NEXT: v_alignbit_b32 v16, s4, s4, 24 +; SI-NEXT: v_alignbit_b32 v0, s7, s7, 8 +; SI-NEXT: v_alignbit_b32 v1, s7, s7, 24 +; SI-NEXT: v_alignbit_b32 v2, s6, s6, 8 +; SI-NEXT: v_alignbit_b32 v4, s6, s6, 24 +; SI-NEXT: v_alignbit_b32 v5, s5, s5, 8 +; SI-NEXT: v_alignbit_b32 v6, s5, s5, 24 +; SI-NEXT: v_alignbit_b32 v7, s4, s4, 8 +; SI-NEXT: v_alignbit_b32 v8, s4, s4, 24 +; SI-NEXT: v_alignbit_b32 v9, s11, s11, 8 +; SI-NEXT: 
v_alignbit_b32 v10, s11, s11, 24 +; SI-NEXT: v_alignbit_b32 v11, s10, s10, 8 +; SI-NEXT: v_alignbit_b32 v12, s10, s10, 24 +; SI-NEXT: v_alignbit_b32 v13, s9, s9, 8 +; SI-NEXT: v_alignbit_b32 v14, s9, s9, 24 +; SI-NEXT: v_alignbit_b32 v15, s8, s8, 8 +; SI-NEXT: v_alignbit_b32 v16, s8, s8, 24 ; SI-NEXT: v_bfi_b32 v3, s12, v1, v0 ; SI-NEXT: v_bfi_b32 v2, s12, v4, v2 ; SI-NEXT: v_bfi_b32 v1, s12, v6, v5 @@ -220,8 +220,8 @@ define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace ; SI-NEXT: v_bfi_b32 v6, s12, v12, v11 ; SI-NEXT: v_bfi_b32 v5, s12, v14, v13 ; SI-NEXT: v_bfi_b32 v4, s12, v16, v15 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_bswap_v8i32: @@ -398,29 +398,29 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { ; SI-LABEL: test_bswap_v4i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s12, 0xff00ff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v0, s2, s2, 8 -; SI-NEXT: v_alignbit_b32 v1, s2, s2, 24 -; SI-NEXT: v_alignbit_b32 v2, s3, s3, 8 -; SI-NEXT: v_alignbit_b32 v4, s3, s3, 24 -; SI-NEXT: v_alignbit_b32 v5, s0, s0, 8 -; SI-NEXT: v_alignbit_b32 v6, s0, s0, 24 -; SI-NEXT: v_alignbit_b32 v7, s1, s1, 8 -; SI-NEXT: v_alignbit_b32 v8, s1, s1, 24 -; SI-NEXT: v_alignbit_b32 v9, s6, s6, 8 -; SI-NEXT: v_alignbit_b32 v10, s6, s6, 24 -; SI-NEXT: v_alignbit_b32 v11, s7, s7, 8 -; SI-NEXT: v_alignbit_b32 v12, s7, s7, 24 -; SI-NEXT: v_alignbit_b32 v13, s4, s4, 8 -; SI-NEXT: v_alignbit_b32 v14, s4, s4, 24 -; SI-NEXT: v_alignbit_b32 v15, s5, s5, 8 -; SI-NEXT: v_alignbit_b32 v16, s5, s5, 24 +; SI-NEXT: v_alignbit_b32 v0, s6, s6, 8 +; SI-NEXT: v_alignbit_b32 v1, s6, s6, 24 +; SI-NEXT: v_alignbit_b32 v2, s7, s7, 8 +; SI-NEXT: v_alignbit_b32 v4, s7, s7, 24 +; SI-NEXT: v_alignbit_b32 v5, s4, s4, 8 +; SI-NEXT: v_alignbit_b32 v6, s4, s4, 24 +; SI-NEXT: v_alignbit_b32 v7, s5, s5, 8 +; SI-NEXT: v_alignbit_b32 v8, s5, s5, 24 +; SI-NEXT: v_alignbit_b32 v9, s10, s10, 8 +; SI-NEXT: v_alignbit_b32 v10, s10, s10, 24 +; SI-NEXT: v_alignbit_b32 v11, s11, s11, 8 +; SI-NEXT: v_alignbit_b32 v12, s11, s11, 24 +; SI-NEXT: v_alignbit_b32 v13, s8, s8, 8 +; SI-NEXT: v_alignbit_b32 v14, s8, s8, 24 +; SI-NEXT: v_alignbit_b32 v15, s9, s9, 8 +; SI-NEXT: v_alignbit_b32 v16, s9, s9, 24 ; SI-NEXT: v_bfi_b32 v3, s12, v1, v0 ; SI-NEXT: v_bfi_b32 v2, s12, v4, v2 ; SI-NEXT: v_bfi_b32 v1, s12, v6, v5 @@ -429,8 +429,8 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; SI-NEXT: v_bfi_b32 v6, s12, v12, v11 ; SI-NEXT: v_bfi_b32 v5, s12, v14, v13 ; SI-NEXT: v_bfi_b32 v4, s12, v16, v15 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
test_bswap_v4i64: diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll index 7b7a67193d226..66ba818b400b6 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll +++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll @@ -10,7 +10,7 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: s_mov_b64 s[4:5], 0 ; ISA-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v0 ; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; ISA-NEXT: v_mov_b32_e32 v7, 0 +; ISA-NEXT: v_mov_b32_e32 v6, 0 ; ISA-NEXT: s_waitcnt lgkmcnt(0) ; ISA-NEXT: s_lshr_b32 s6, s5, 1 ; ISA-NEXT: s_lshr_b32 s7, 1, s4 @@ -27,18 +27,18 @@ define void @f(i32 %arg, ptr %ptr) { ; ISA-NEXT: s_mov_b32 s4, 0 ; ISA-NEXT: .LBB0_1: ; %bb14 ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 -; ISA-NEXT: v_mov_b32_e32 v6, v7 +; ISA-NEXT: v_mov_b32_e32 v7, v6 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo ; ISA-NEXT: s_or_b32 s4, s5, s4 -; ISA-NEXT: v_add_f32_e32 v7, v6, v0 -; ISA-NEXT: v_add_f32_e64 v7, v7, |v3| -; ISA-NEXT: v_add_f32_e32 v7, v7, v4 -; ISA-NEXT: v_add_f32_e32 v7, v7, v5 +; ISA-NEXT: v_add_f32_e32 v6, v7, v0 +; ISA-NEXT: v_add_f32_e64 v6, v6, |v3| +; ISA-NEXT: v_add_f32_e32 v6, v6, v4 +; ISA-NEXT: v_add_f32_e32 v6, v6, v5 ; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; ISA-NEXT: s_cbranch_execnz .LBB0_1 ; ISA-NEXT: ; %bb.2: ; %bb21 ; ISA-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; ISA-NEXT: flat_store_dword v[1:2], v6 +; ISA-NEXT: flat_store_dword v[1:2], v7 ; ISA-NEXT: s_waitcnt lgkmcnt(0) ; ISA-NEXT: s_setpc_b64 s[30:31] ; MIR-LABEL: name: f diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 2338add43d06c..2184478635e0e 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -709,118 +709,118 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc -; GFX9-NEXT: v_xor_b32_e32 v10, v3, v9 -; GFX9-NEXT: v_xor_b32_e32 v11, v2, v9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v11 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v10 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v11 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc -; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v8, v6 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v6, 0 -; GFX9-NEXT: v_mul_lo_u32 v5, v7, v12 -; GFX9-NEXT: v_mul_hi_u32 v13, v6, v2 -; GFX9-NEXT: v_add3_u32 v5, v3, v5, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v3, vcc -; 
GFX9-NEXT: v_mul_lo_u32 v4, v7, v12 -; GFX9-NEXT: v_mul_lo_u32 v5, v8, v13 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v13, 0 -; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v11 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v3, v11, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v11 +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v11 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v2 +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, 0, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc +; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v5 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v8, 0 +; GFX9-NEXT: v_mul_lo_u32 v7, v9, v12 +; GFX9-NEXT: v_mul_hi_u32 v13, v8, v4 +; GFX9-NEXT: v_add3_u32 v7, v5, v7, v6 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v5 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v4, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v8, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v9, v12 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v13 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v13, 0 +; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v7, 0 +; GFX9-NEXT: v_mul_hi_u32 v14, v13, v4 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v14, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v12, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v5, vcc ; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v7 -; GFX9-NEXT: v_xor_b32_e32 v5, v0, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v7, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v3, 0 -; GFX9-NEXT: v_mul_hi_u32 v6, v5, v2 -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v7 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 -; 
GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v2 -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v2, 0 -; GFX9-NEXT: v_add3_u32 v1, v1, v8, v6 -; GFX9-NEXT: v_sub_u32_e32 v6, v4, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v5, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v6, v10, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v0, v11 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[6:7], 0, v6, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v11 +; GFX9-NEXT: v_xor_b32_e32 v8, v0, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v7, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 +; GFX9-NEXT: v_mul_hi_u32 v9, v8, v4 +; GFX9-NEXT: v_xor_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v9, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v10, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, v3, v5 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v4, 0 +; GFX9-NEXT: v_add3_u32 v1, v1, v10, v9 +; GFX9-NEXT: v_sub_u32_e32 v9, v6, v1 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v9, v2, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v0, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[6:7] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v12, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[6:7] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 2, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v3, s[6:7] -; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 1, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v3, s[6:7] -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v16, v14, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 2, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v5, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 1, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v5, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v16, v14, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, v1, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v15, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v5, v7, v9 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v5 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v5 -; GFX9-NEXT: v_sub_co_u32_e64 v4, s[8:9], v2, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[4:5], v6, v10, s[4:5] -; GFX9-NEXT: v_subb_co_u32_e64 v5, s[8:9], v3, v5, s[8:9] -; GFX9-NEXT: v_sub_co_u32_e64 v3, s[4:5], v8, v11 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[4:5], v8, v2, s[4:5] +; GFX9-NEXT: v_sub_co_u32_e64 v3, s[4:5], v9, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc ; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[4:5], 0, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v12, v2, s[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v15, v13, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v6, v7, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GFX9-NEXT: v_xor_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_xor_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v6 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[8:9], v5, v6, s[8:9] ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7 ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v0, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index 3145c1c3e868b..91fab927be3af 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -164,32 +164,32 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s16, s4 -; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s16, s8 +; SI-NEXT: s_mov_b32 s17, s9 +; SI-NEXT: s_mov_b32 s20, s10 +; SI-NEXT: s_mov_b32 s21, s11 ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 0acd24dc5e3a0..26ccede123601 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -520,18 +520,18 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -557,7 +557,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 1adf93c8e17a5..855b5fff11fe5 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -876,16 +876,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 @@ -916,7 +916,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 ; SI-NEXT: v_min3_u32 v0, v0, v1, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: 
s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index dbe2bba62bc9c..4202edfbd0eb4 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -1214,7 +1214,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1222,7 +1222,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1251,18 +1251,18 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v5 ; SI-NEXT: v_or_b32_e32 v3, v11, v2 -; SI-NEXT: v_or_b32_e32 v9, v9, v12 +; SI-NEXT: v_or_b32_e32 v8, v8, v12 ; SI-NEXT: v_or_b32_e32 v2, v10, v13 -; SI-NEXT: v_or_b32_e32 v8, v8, v14 +; SI-NEXT: v_or_b32_e32 v9, v9, v14 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB7_3 ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_2: -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -1271,7 +1271,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_mov_b32 s39, 0xf000 ; SI-NEXT: s_mov_b32 s36, s38 ; SI-NEXT: s_mov_b32 s37, s38 -; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1279,7 +1279,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1308,13 +1308,13 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v0 -; SI-NEXT: v_or_b32_e32 v9, v9, v1 +; SI-NEXT: v_or_b32_e32 v8, v8, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v10 -; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: 
v_or_b32_e32 v9, v9, v11 ; SI-NEXT: .LBB7_4: ; %exit -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 @@ -1499,13 +1499,13 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:4 glc +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:8 glc +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:10 glc +; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1528,30 +1528,30 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; SI-NEXT: v_or_b32_e32 v6, v10, v12 -; SI-NEXT: v_or_b32_e32 v9, v9, v13 -; SI-NEXT: v_or_b32_e32 v8, v8, v14 -; SI-NEXT: v_or_b32_e32 v10, v5, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_or_b32_e32 v9, v10, v12 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 +; SI-NEXT: v_or_b32_e32 v10, v7, v14 +; SI-NEXT: v_or_b32_e32 v11, v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v11 ; SI-NEXT: s_mov_b64 vcc, exec ; SI-NEXT: s_cbranch_execz .LBB8_3 ; SI-NEXT: s_branch .LBB8_4 ; SI-NEXT: .LBB8_2: ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -1562,9 +1562,9 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: s_mov_b32 s37, s38 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:2 glc +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:2 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc +; 
SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:4 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1595,25 +1595,25 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v0, v9, v0 ; SI-NEXT: v_or_b32_e32 v1, v8, v1 -; SI-NEXT: v_or_b32_e32 v8, v6, v10 +; SI-NEXT: v_or_b32_e32 v8, v7, v10 ; SI-NEXT: v_or_b32_e32 v9, v5, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: .LBB8_4: ; %exit ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -1621,9 +1621,9 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_mov_b32_e32 v9, 0x3f200000 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 @@ -1631,11 +1631,11 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 ; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4 ; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 ; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v10 ; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index 5694a9dc1ffd4..8acc38eaf0170 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -230,6 +230,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 +; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 ; FLAT_SCR_OPT-NEXT: ; kill: killed $vgpr1 ; FLAT_SCR_OPT-NEXT: global_store_dword v2, v0, s[0:1] @@ -351,6 +353,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 ; 
FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1
+; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1
+; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105
 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0
 ; FLAT_SCR_ARCH-NEXT: ; kill: killed $vgpr1
 ; FLAT_SCR_ARCH-NEXT: global_store_dword v2, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll
index b1db04a7fd886..0f8560c1d7628 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.ll
@@ -154,3 +154,27 @@ define float @fold_fmul_distributive(float %x, float %y) {
   %fmul = fmul contract float %fadd, %x
   ret float %fmul
 }
+
+; Test that the contract flag is not dropped, so that an fma can be generated from the following mul/add.
+define amdgpu_kernel void @vec_mul_scalar_add_fma(<2 x float> %a, <2 x float> %b, float %c1, ptr addrspace(1) %inptr) {
+; GFX906-LABEL: vec_mul_scalar_add_fma:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: s_load_dword s5, s[0:1], 0x34
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX906-NEXT: v_mov_b32_e32 v0, 0
+; GFX906-NEXT: v_mov_b32_e32 v1, s6
+; GFX906-NEXT: v_mul_f32_e32 v1, s4, v1
+; GFX906-NEXT: s_waitcnt lgkmcnt(0)
+; GFX906-NEXT: v_add_f32_e32 v1, s5, v1
+; GFX906-NEXT: global_store_dword v0, v1, s[2:3] offset:4
+; GFX906-NEXT: s_endpgm
+  %gep = getelementptr float, ptr addrspace(1) %inptr, i32 1
+  %c = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> zeroinitializer
+  %mul = fmul contract <2 x float> %c, %b
+  %elv = extractelement <2 x float> %mul, i64 0
+  %add = fadd contract float %elv, %c1
+  store float %add, ptr addrspace(1) %gep, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index e324b27f3f4ba..592be64e1e314 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -3794,14 +3794,14 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; SI-SAFE: ; %bb.0:
 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SAFE-NEXT: s_brev_b32 s4, -2
 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT: v_trunc_f32_e32 v2, v0
-; SI-SAFE-NEXT: v_bfi_b32 v1, s4, 1.0, v0
-; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v2
-; SI-SAFE-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5
-; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-SAFE-NEXT: v_add_f32_e32 v0, v2, v0
+; SI-SAFE-NEXT: v_trunc_f32_e32 v1, v0
+; SI-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1
+; SI-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; SI-SAFE-NEXT: s_brev_b32 s4, -2
+; SI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0
+; SI-SAFE-NEXT: v_add_f32_e32 v0, v1, v0
 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -3809,56 +3809,55 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; SI-NSZ: ; %bb.0:
 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NSZ-NEXT: s_brev_b32 s4, -2
 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NSZ-NEXT: v_trunc_f32_e32 v2, v0
-; SI-NSZ-NEXT: v_bfi_b32 v1, s4, 1.0, v0
-; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v2
-; SI-NSZ-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5
-; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v2, v0
+; SI-NSZ-NEXT: v_trunc_f32_e32 v1, v0
+; SI-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1
+; SI-NSZ-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; SI-NSZ-NEXT: 
v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] +; SI-NSZ-NEXT: s_brev_b32 s4, -2 +; SI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0 +; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: v_fneg_round_f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SAFE-NEXT: v_trunc_f16_e32 v1, v0 +; VI-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-SAFE-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-SAFE-NEXT: s_movk_i32 s4, 0x7fff -; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x3c00 -; VI-SAFE-NEXT: v_trunc_f16_e32 v2, v0 -; VI-SAFE-NEXT: v_bfi_b32 v1, s4, v1, v0 -; VI-SAFE-NEXT: v_sub_f16_e32 v0, v0, v2 -; VI-SAFE-NEXT: v_cmp_ge_f16_e64 vcc, |v0|, 0.5 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-SAFE-NEXT: v_add_f16_e32 v0, v2, v0 +; VI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0 +; VI-SAFE-NEXT: v_add_f16_e32 v0, v1, v0 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: v_fneg_round_f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NSZ-NEXT: v_trunc_f16_e32 v1, v0 +; VI-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1 +; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NSZ-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; VI-NSZ-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NSZ-NEXT: s_movk_i32 s4, 0x7fff -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x3c00 -; VI-NSZ-NEXT: v_trunc_f16_e32 v2, v0 -; VI-NSZ-NEXT: v_bfi_b32 v1, s4, v1, v0 -; VI-NSZ-NEXT: v_sub_f16_e32 v0, v0, v2 -; VI-NSZ-NEXT: v_cmp_ge_f16_e64 vcc, |v0|, 0.5 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v2, v0 +; VI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0 +; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-LABEL: v_fneg_round_f16: ; GFX11-SAFE: ; %bb.0: ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SAFE-NEXT: v_trunc_f16_e32 v1, v0 -; GFX11-SAFE-NEXT: s_movk_i32 s0, 0x3c00 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1 -; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 vcc_lo, |v2|, 0.5 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 +; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] ; @@ -3866,13 +3865,13 @@ define half @v_fneg_round_f16(half %a) #0 { ; GFX11-NSZ: ; %bb.0: ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NSZ-NEXT: v_trunc_f16_e32 v1, v0 -; GFX11-NSZ-NEXT: s_movk_i32 s0, 0x3c00 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1 -; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NSZ-NEXT: 
v_cmp_ge_f16_e64 vcc_lo, |v2|, 0.5 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX11-NSZ-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0 +; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 +; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] %round = call half @llvm.round.f16(half %a) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index f78b302761ed2..203c6223e762f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2207,26 +2207,26 @@ define float @v_fneg_round_f32(float %a) #0 { ; GCN-SAFE-LABEL: v_fneg_round_f32: ; GCN-SAFE: ; %bb.0: ; GCN-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-SAFE-NEXT: v_trunc_f32_e32 v1, v0 +; GCN-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1 +; GCN-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; GCN-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] ; GCN-SAFE-NEXT: s_brev_b32 s4, -2 -; GCN-SAFE-NEXT: v_trunc_f32_e32 v2, v0 -; GCN-SAFE-NEXT: v_bfi_b32 v1, s4, 1.0, v0 -; GCN-SAFE-NEXT: v_sub_f32_e32 v0, v0, v2 -; GCN-SAFE-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5 -; GCN-SAFE-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-SAFE-NEXT: v_add_f32_e32 v0, v2, v0 +; GCN-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GCN-SAFE-NEXT: v_add_f32_e32 v0, v1, v0 ; GCN-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-NSZ-LABEL: v_fneg_round_f32: ; GCN-NSZ: ; %bb.0: ; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NSZ-NEXT: v_trunc_f32_e32 v1, v0 +; GCN-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1 +; GCN-NSZ-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; GCN-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] ; GCN-NSZ-NEXT: s_brev_b32 s4, -2 -; GCN-NSZ-NEXT: v_trunc_f32_e32 v2, v0 -; GCN-NSZ-NEXT: v_bfi_b32 v1, s4, 1.0, v0 -; GCN-NSZ-NEXT: v_sub_f32_e32 v0, v0, v2 -; GCN-NSZ-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5 -; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-NSZ-NEXT: v_sub_f32_e64 v0, -v2, v0 +; GCN-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GCN-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0 ; GCN-NSZ-NEXT: s_setpc_b64 s[30:31] %round = call float @llvm.round.f32(float %a) %fneg = fneg float %round diff --git a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll b/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll index 79e23df3eed0e..cb3f1a3e77c69 100644 --- a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll +++ b/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll @@ -1,141 +1,141 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s -; RUN: llc -march=amdgcn -mcpu=gfx941 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s -; RUN: llc -march=amdgcn -mcpu=gfx942 -verify-machineinstrs -mattr=-forcestoresc1 < %s | FileCheck --check-prefixes=GCN,NOSC0SC1 %s - -define amdgpu_kernel void @store_global(ptr addrspace(1) %ptr) { -; FORCESC0SC1-LABEL: store_global: -; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 -; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; 
FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_global: -; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0 -; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 -; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1] -; NOSC0SC1-NEXT: s_endpgm -entry: - store float 1.000000e+00, ptr addrspace(1) %ptr, align 4 - ret void -} - -define amdgpu_kernel void @store_flat(ptr addrspace(0) %ptr) { -; FORCESC0SC1-LABEL: store_flat: -; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v2, 1.0 -; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; FORCESC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; FORCESC0SC1-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_flat: -; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; NOSC0SC1-NEXT: v_mov_b32_e32 v2, 1.0 -; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; NOSC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; NOSC0SC1-NEXT: flat_store_dword v[0:1], v2 -; NOSC0SC1-NEXT: s_endpgm -entry: - store float 1.000000e+00, ptr addrspace(0) %ptr, align 4 - ret void -} - -define amdgpu_kernel void @store_lds(ptr addrspace(3) %ptr) { -; GCN-LABEL: store_lds: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: ds_write_b32 v1, v0 -; GCN-NEXT: s_endpgm -entry: - store float 1.000000e+00, ptr addrspace(3) %ptr, align 4 - ret void -} - -define amdgpu_kernel void @store_scratch(ptr addrspace(5) %ptr) { -; FORCESC0SC1-LABEL: store_scratch: -; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 1.0 -; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; FORCESC0SC1-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_scratch: -; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24 -; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 1.0 -; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; NOSC0SC1-NEXT: scratch_store_dword off, v0, s0 -; NOSC0SC1-NEXT: s_endpgm -entry: - store float 1.000000e+00, ptr addrspace(5) %ptr, align 4 - ret void -} - -define amdgpu_ps void @store_buffer(<4 x i32> inreg %rsrc, float %data, i32 %index) { -; FORCESC0SC1-LABEL: store_buffer: -; FORCESC0SC1: ; %bb.0: ; %main_body -; FORCESC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_buffer: -; NOSC0SC1: ; %bb.0: ; %main_body -; NOSC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen -; NOSC0SC1-NEXT: s_endpgm -main_body: - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -define amdgpu_kernel void @store_global_atomic(ptr addrspace(1) %ptr) { -; FORCESC0SC1-LABEL: store_global_atomic: -; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 -; FORCESC0SC1-NEXT: buffer_wbl2 sc1 -; FORCESC0SC1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_global_atomic: -; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 -; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0 -; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 -; NOSC0SC1-NEXT: buffer_wbl2 sc1 -; NOSC0SC1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc1 -; NOSC0SC1-NEXT: s_endpgm -entry: - store atomic float 1.000000e+00, ptr addrspace(1) %ptr syncscope("agent-one-as") seq_cst, align 4 - ret void -} - -define amdgpu_kernel void @store_global_atomic_system(ptr addrspace(1) %ptr) { -; GCN-LABEL: store_global_atomic_system: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 -; GCN-NEXT: s_endpgm - store atomic float 1.000000e+00, ptr addrspace(1) %ptr monotonic, align 4 - ret void -} - - -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s +; RUN: llc -march=amdgcn -mcpu=gfx941 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s +; RUN: llc -march=amdgcn -mcpu=gfx942 -verify-machineinstrs -mattr=-forcestoresc1 < %s | FileCheck --check-prefixes=GCN,NOSC0SC1 %s + +define amdgpu_kernel void @store_global(ptr addrspace(1) %ptr) { +; FORCESC0SC1-LABEL: store_global: +; FORCESC0SC1: ; %bb.0: ; %entry +; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0 +; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 +; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) +; FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; FORCESC0SC1-NEXT: s_endpgm +; +; NOSC0SC1-LABEL: store_global: +; NOSC0SC1: ; %bb.0: ; %entry +; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0 +; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 +; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) +; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1] +; NOSC0SC1-NEXT: s_endpgm +entry: + store float 1.000000e+00, ptr addrspace(1) %ptr, align 4 + ret void +} + +define amdgpu_kernel void @store_flat(ptr addrspace(0) %ptr) { +; FORCESC0SC1-LABEL: store_flat: +; FORCESC0SC1: ; %bb.0: ; %entry +; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FORCESC0SC1-NEXT: v_mov_b32_e32 v2, 1.0 +; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) +; FORCESC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; FORCESC0SC1-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; FORCESC0SC1-NEXT: s_endpgm +; +; NOSC0SC1-LABEL: store_flat: +; NOSC0SC1: ; %bb.0: ; %entry +; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOSC0SC1-NEXT: v_mov_b32_e32 v2, 1.0 +; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) +; NOSC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; NOSC0SC1-NEXT: flat_store_dword v[0:1], v2 +; NOSC0SC1-NEXT: s_endpgm +entry: + store float 1.000000e+00, ptr addrspace(0) %ptr, align 4 + ret void +} + +define amdgpu_kernel void @store_lds(ptr addrspace(3) %ptr) { +; GCN-LABEL: store_lds: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: ds_write_b32 v1, v0 +; GCN-NEXT: s_endpgm +entry: + store float 1.000000e+00, ptr addrspace(3) %ptr, align 4 + ret void +} + +define amdgpu_kernel void @store_scratch(ptr addrspace(5) %ptr) { +; FORCESC0SC1-LABEL: store_scratch: +; FORCESC0SC1: ; %bb.0: ; %entry +; FORCESC0SC1-NEXT: 
s_load_dword s0, s[0:1], 0x24 +; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 1.0 +; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) +; FORCESC0SC1-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 +; FORCESC0SC1-NEXT: s_endpgm +; +; NOSC0SC1-LABEL: store_scratch: +; NOSC0SC1: ; %bb.0: ; %entry +; NOSC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24 +; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 1.0 +; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) +; NOSC0SC1-NEXT: scratch_store_dword off, v0, s0 +; NOSC0SC1-NEXT: s_endpgm +entry: + store float 1.000000e+00, ptr addrspace(5) %ptr, align 4 + ret void +} + +define amdgpu_ps void @store_buffer(<4 x i32> inreg %rsrc, float %data, i32 %index) { +; FORCESC0SC1-LABEL: store_buffer: +; FORCESC0SC1: ; %bb.0: ; %main_body +; FORCESC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen sc0 sc1 +; FORCESC0SC1-NEXT: s_endpgm +; +; NOSC0SC1-LABEL: store_buffer: +; NOSC0SC1: ; %bb.0: ; %main_body +; NOSC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; NOSC0SC1-NEXT: s_endpgm +main_body: + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +define amdgpu_kernel void @store_global_atomic(ptr addrspace(1) %ptr) { +; FORCESC0SC1-LABEL: store_global_atomic: +; FORCESC0SC1: ; %bb.0: ; %entry +; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0 +; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 +; FORCESC0SC1-NEXT: buffer_wbl2 sc1 +; FORCESC0SC1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; FORCESC0SC1-NEXT: s_endpgm +; +; NOSC0SC1-LABEL: store_global_atomic: +; NOSC0SC1: ; %bb.0: ; %entry +; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0 +; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 +; NOSC0SC1-NEXT: buffer_wbl2 sc1 +; NOSC0SC1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc1 +; NOSC0SC1-NEXT: s_endpgm +entry: + store atomic float 1.000000e+00, ptr addrspace(1) %ptr syncscope("agent-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @store_global_atomic_system(ptr addrspace(1) %ptr) { +; GCN-LABEL: store_global_atomic_system: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GCN-NEXT: s_endpgm + store atomic float 1.000000e+00, ptr addrspace(1) %ptr monotonic, align 4 + ret void +} + + +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index e32d5d773058a..76c40f5962c58 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -294,16 +294,16 @@ entry: define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s8, 0x2f800000 ; SI-NEXT: s_mov_b32 s9, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_trunc_f32_e32 v0, s3 -; SI-NEXT: v_trunc_f32_e32 v1, s2 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_trunc_f32_e32 v0, s7 +; 
SI-NEXT: v_trunc_f32_e32 v1, s6 ; SI-NEXT: v_mul_f32_e64 v2, |v0|, s8 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; SI-NEXT: v_mul_f32_e64 v4, |v1|, s8 @@ -324,7 +324,7 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; SI-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc ; SI-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v6, v5, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index 5703f0771e96d..13e588dffaf5c 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -4316,20 +4316,20 @@ entry: define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SDAG-IEEE-NEXT: s_mov_b32 s11, 0xf000 -; SDAG-IEEE-NEXT: s_mov_b32 s10, -1 +; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s7, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s7 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s11, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; SDAG-IEEE-NEXT: s_mov_b32 s8, s4 -; SDAG-IEEE-NEXT: s_mov_b32 s9, s5 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s10, v1 +; SDAG-IEEE-NEXT: s_mov_b32 s4, s8 +; SDAG-IEEE-NEXT: s_mov_b32 s5, s9 ; SDAG-IEEE-NEXT: v_add_i32_e32 v4, vcc, -1, v3 ; SDAG-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v2 ; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 @@ -4340,8 +4340,8 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s6 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s10 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v5, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v0 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 @@ -4359,7 +4359,7 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SDAG-IEEE-NEXT: s_endpgm ; ; GISEL-IEEE-LABEL: elim_redun_check_v2: @@ -4522,20 +4522,20 @@ entry: define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) { ; SDAG-IEEE-LABEL: elim_redun_check_v2_ult: ; SDAG-IEEE: ; %bb.0: ; %entry -; SDAG-IEEE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SDAG-IEEE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v0, 0xf800000 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SDAG-IEEE-NEXT: 
s_mov_b32 s11, 0xf000 -; SDAG-IEEE-NEXT: s_mov_b32 s10, -1 +; SDAG-IEEE-NEXT: s_mov_b32 s7, 0xf000 +; SDAG-IEEE-NEXT: s_mov_b32 s6, -1 ; SDAG-IEEE-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s7, v1 -; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s7 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v0 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v2, s11, v1 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v3, v2 -; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s6, v1 -; SDAG-IEEE-NEXT: s_mov_b32 s8, s4 -; SDAG-IEEE-NEXT: s_mov_b32 s9, s5 +; SDAG-IEEE-NEXT: v_mul_f32_e32 v1, s10, v1 +; SDAG-IEEE-NEXT: s_mov_b32 s4, s8 +; SDAG-IEEE-NEXT: s_mov_b32 s5, s9 ; SDAG-IEEE-NEXT: v_add_i32_e32 v4, vcc, -1, v3 ; SDAG-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v2 ; SDAG-IEEE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5 @@ -4546,8 +4546,8 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; SDAG-IEEE-NEXT: v_mul_f32_e32 v4, 0x37800000, v3 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s6 -; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s6, v0 +; SDAG-IEEE-NEXT: v_mov_b32_e32 v5, s10 +; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], s10, v0 ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v0, v5, v1, s[0:1] ; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v5, v0 ; SDAG-IEEE-NEXT: v_mov_b32_e32 v4, 0x260 @@ -4565,7 +4565,7 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f ; SDAG-IEEE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; SDAG-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4 ; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SDAG-IEEE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SDAG-IEEE-NEXT: s_endpgm ; ; GISEL-IEEE-LABEL: elim_redun_check_v2_ult: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 780d6a8680ff8..8dd73c5ab32fb 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2565,26 +2565,26 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX10-NEXT: s_clause 0x14 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:152 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:156 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:160 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:136 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], 
s32 offset:144 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:152 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:156 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 ; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:76 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:120 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:116 @@ -2636,33 +2636,33 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:284 -; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:280 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:276 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:272 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:268 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:264 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:260 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:256 +; GFX10-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:284 +; GFX10-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:280 +; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:276 +; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:272 +; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:268 +; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:264 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:260 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:256 ; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:252 ; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:244 -; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen offset:248 +; GFX10-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen offset:244 
+; GFX10-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen offset:240 ; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:220 ; GFX10-NEXT: s_waitcnt vmcnt(16) ; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:216 ; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:212 ; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:208 ; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:204 -; GFX10-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen offset:192 -; GFX10-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:188 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:200 +; GFX10-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:188 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:184 ; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:180 @@ -2697,7 +2697,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xe +; GFX11-NEXT: s_clause 0x10 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:220 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:216 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:212 @@ -2705,47 +2705,59 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:204 ; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:200 ; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:196 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:192 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:188 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:184 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:180 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:176 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:192 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:188 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:184 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:180 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:176 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:172 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:168 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:164 +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s32 offset:224 +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s32 offset:240 ; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 
offset:160 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:156 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:152 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:148 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:144 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:140 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:136 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:132 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:160 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:156 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:152 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:148 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:144 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:140 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:136 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:132 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:104 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72 +; GFX11-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v26, v23 +; GFX11-NEXT: v_dual_mov_b32 v25, v22 :: v_dual_mov_b32 v24, v21 ; GFX11-NEXT: s_add_i32 s1, s0, 0x110 ; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v1, off, s32 offset:88 +; GFX11-NEXT: v_dual_mov_b32 v23, v20 :: v_dual_mov_b32 v22, v19 +; GFX11-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v17 +; GFX11-NEXT: v_dual_mov_b32 v19, v16 :: v_dual_mov_b32 v18, v15 +; GFX11-NEXT: v_dual_mov_b32 v17, v14 :: v_dual_mov_b32 v16, v13 +; GFX11-NEXT: v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v14, v11 +; GFX11-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v12, v9 +; GFX11-NEXT: v_dual_mov_b32 v11, v8 :: 
v_dual_mov_b32 v10, v7 +; GFX11-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:104 ; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 ; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 ; GFX11-NEXT: s_add_i32 s34, s0, 0xc0 @@ -2760,61 +2772,59 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: s_add_i32 s43, s0, 48 ; GFX11-NEXT: s_add_i32 s44, s0, 32 ; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s1 +; GFX11-NEXT: scratch_store_b128 off, v[45:48], s1 ; GFX11-NEXT: s_add_i32 s1, s0, 0x100 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:108 ; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: scratch_store_b128 off, v[36:39], s1 -; GFX11-NEXT: s_clause 0xb -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 +; GFX11-NEXT: scratch_store_b128 off, v[56:59], s1 +; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v1, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v4, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b128 v[28:31], off, s32 offset:224 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 ; GFX11-NEXT: s_add_i32 s0, s0, 16 -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: scratch_store_b128 off, v[33:36], s1 -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: scratch_store_b128 off, v[60:63], s2 -; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: scratch_store_b128 off, v[0:3], s3 -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: scratch_store_b128 off, v[43:46], s35 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: scratch_store_b128 off, v[39:42], s36 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s39 -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s40 -; GFX11-NEXT: 
scratch_store_b128 off, v[21:24], s41 -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s42 -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43 -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s44 -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s32 offset:224 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 +; GFX11-NEXT: scratch_store_b128 off, v[59:62], s2 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s3 +; GFX11-NEXT: scratch_store_b128 off, v[41:44], s34 +; GFX11-NEXT: scratch_store_b128 off, v[37:40], s35 +; GFX11-NEXT: scratch_store_b128 off, v[52:55], s36 +; GFX11-NEXT: scratch_store_b128 off, v[48:51], s37 +; GFX11-NEXT: scratch_store_b128 off, v[33:36], s38 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 offset:224 ; 16-byte Folded Reload +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s39 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 offset:240 ; 16-byte Folded Reload +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s40 +; GFX11-NEXT: scratch_store_b128 off, v[24:27], s41 +; GFX11-NEXT: scratch_store_b128 off, v[20:23], s42 +; GFX11-NEXT: scratch_store_b128 off, v[16:19], s43 +; GFX11-NEXT: scratch_store_b128 off, v[12:15], s44 +; GFX11-NEXT: scratch_store_b128 off, v[8:11], s0 ; GFX11-NEXT: s_clause 0xe -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:176 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:180 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:184 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:188 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:192 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:164 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:168 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:172 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:176 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:180 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:184 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:188 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:192 ; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:196 ; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:200 ; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:204 @@ -2977,14 +2987,29 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:788 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:516 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:520 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:524 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:528 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:532 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:536 -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:540 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:516 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_load_dword v9, off, s[0:3], s33 offset:524 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:528 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:532 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:536 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:540 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:544 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 @@ -2999,30 +3024,15 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:592 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:596 ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:600 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:604 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:608 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:612 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:616 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:620 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:624 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:628 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:604 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:608 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:612 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:616 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:620 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:8 @@ -3065,21 +3075,13 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX9-NEXT: v_mov_b32_e32 v2, v24 -; GFX9-NEXT: v_mov_b32_e32 v3, v25 -; GFX9-NEXT: v_mov_b32_e32 v4, v26 -; GFX9-NEXT: v_mov_b32_e32 v5, v27 -; GFX9-NEXT: v_mov_b32_e32 v6, v28 -; GFX9-NEXT: v_mov_b32_e32 v7, v29 -; GFX9-NEXT: v_mov_b32_e32 v8, v30 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 @@ -3396,8 +3398,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v32, s33 offset:1536 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3407,7 +3409,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xe +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:60 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:56 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:52 ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:48 @@ -3447,7 +3450,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_load_b64 s[46:47], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s3, s32, 16 ; GFX11-NEXT: s_add_i32 s0, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v32, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s3 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 @@ -3466,112 +3469,109 @@ define amdgpu_gfx void @call_72xi32() #1 { ; 
GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0 ; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0 ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v32, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 -; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640 -; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_mov_b32_e32 v32, v48 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:624 +; GFX11-NEXT: scratch_load_b128 v[26:29], off, s33 offset:640 ; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 ; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 -; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:688 -; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:704 -; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:720 -; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:736 +; GFX11-NEXT: scratch_load_b128 v[40:43], off, s33 offset:688 +; GFX11-NEXT: scratch_load_b128 v[44:47], off, s33 offset:704 +; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:720 +; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:736 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:752 ; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 ; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 ; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:512 -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52 +; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: v_dual_mov_b32 v31, v50 :: v_dual_mov_b32 v30, v49 ; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44 -; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v16 +; GFX11-NEXT: v_dual_mov_b32 v49, v40 :: v_dual_mov_b32 v50, v41 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_dual_mov_b32 v41, v56 :: v_dual_mov_b32 v40, v47 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_dual_mov_b32 v47, v2 :: v_dual_mov_b32 v2, v5 +; GFX11-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v27 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:528 -; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 -; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 -; GFX11-NEXT: v_mov_b32_e32 v56, v63 -; GFX11-NEXT: v_mov_b32_e32 v16, v19 -; GFX11-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v2 -; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 +; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:544 +; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:560 +; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:576 +; GFX11-NEXT: v_dual_mov_b32 v39, v28 :: v_dual_mov_b32 v28, v29 +; GFX11-NEXT: v_dual_mov_b32 v29, v48 :: v_dual_mov_b32 v48, v55 +; GFX11-NEXT: v_dual_mov_b32 v55, v46 
:: v_dual_mov_b32 v46, v1 +; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, v7 +; GFX11-NEXT: v_mov_b32_e32 v5, v8 +; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v56, v59 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v15 +; GFX11-NEXT: v_mov_b32_e32 v8, v15 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_dual_mov_b32 v10, v21 :: v_dual_mov_b32 v15, v26 +; GFX11-NEXT: v_dual_mov_b32 v10, v17 :: v_dual_mov_b32 v15, v22 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 +; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1572 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:592 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 +; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1556 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 -; GFX11-NEXT: v_mov_b32_e32 v32, v36 -; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 -; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51 -; GFX11-NEXT: v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41 -; GFX11-NEXT: v_mov_b32_e32 v50, v42 -; GFX11-NEXT: v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v17 -; GFX11-NEXT: v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9 +; GFX11-NEXT: scratch_store_b128 off, v[24:27], s33 offset:1540 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[36:39], s32 +; GFX11-NEXT: v_dual_mov_b32 v37, v52 :: v_dual_mov_b32 v38, v53 +; GFX11-NEXT: v_mov_b32_e32 v39, v54 +; GFX11-NEXT: v_dual_mov_b32 v53, v44 :: v_dual_mov_b32 v54, v45 +; GFX11-NEXT: v_dual_mov_b32 v44, v63 :: v_dual_mov_b32 v45, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v6 +; GFX11-NEXT: v_mov_b32_e32 v6, v9 ; GFX11-NEXT: scratch_store_b32 off, v11, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x90 -; GFX11-NEXT: v_mov_b32_e32 v51, v43 -; GFX11-NEXT: v_mov_b32_e32 v41, v59 +; GFX11-NEXT: v_dual_mov_b32 v36, v51 :: v_dual_mov_b32 v51, v42 +; GFX11-NEXT: v_mov_b32_e32 v52, v43 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 -; GFX11-NEXT: v_mov_b32_e32 v7, v14 ; GFX11-NEXT: s_add_i32 s0, s32, 0x80 -; GFX11-NEXT: v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61 +; GFX11-NEXT: v_mov_b32_e32 v42, v57 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 -; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20 +; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v5, v12 ; GFX11-NEXT: s_add_i32 s0, s32, 0x70 -; GFX11-NEXT: v_mov_b32_e32 v5, v12 -; GFX11-NEXT: scratch_store_b128 off, v[16:19], s0 +; GFX11-NEXT: v_mov_b32_e32 v43, v58 +; GFX11-NEXT: v_dual_mov_b32 v57, v60 :: v_dual_mov_b32 v58, v61 +; GFX11-NEXT: scratch_store_b128 off, v[44:47], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x6c -; GFX11-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v11, v22 +; GFX11-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v7, v14 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x60 -; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 +; GFX11-NEXT: 
v_mov_b32_e32 v9, v16 ; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x50 -; GFX11-NEXT: v_mov_b32_e32 v13, v24 -; GFX11-NEXT: scratch_store_b128 off, v[41:44], s0 +; GFX11-NEXT: v_mov_b32_e32 v11, v18 +; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47 +; GFX11-NEXT: v_dual_mov_b32 v12, v19 :: v_dual_mov_b32 v13, v20 ; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 48 -; GFX11-NEXT: v_mov_b32_e32 v16, v27 +; GFX11-NEXT: v_mov_b32_e32 v14, v21 ; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 32 -; GFX11-NEXT: v_mov_b32_e32 v30, v46 +; GFX11-NEXT: v_mov_b32_e32 v16, v23 ; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 -; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 42 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s0 +; GFX11-NEXT: v_mov_b32_e32 v29, v33 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540 ; GFX11-NEXT: s_add_i32 s0, s33, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v30, v34 :: v_dual_mov_b32 v31, v35 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[46:47] -; GFX11-NEXT: s_clause 0xe +; GFX11-NEXT: s_clause 0xf ; GFX11-NEXT: scratch_load_b32 v63, off, s33 ; GFX11-NEXT: scratch_load_b32 v62, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v61, off, s33 offset:8 @@ -3587,10 +3587,11 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:48 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:52 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:56 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:60 +; GFX11-NEXT: v_readlane_b32 s31, v32, 1 +; GFX11-NEXT: v_readlane_b32 s30, v32, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v32, off, s33 offset:1536 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 ; GFX11-NEXT: s_mov_b32 s33, s45 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 3344b537ce28f..a6d56df00c862 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4706,27 +4706,27 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dword s5, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s6, s0, 16 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: .LBB73_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_max_i32_e32 v0, s4, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB73_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 @@ -5845,27 +5845,27 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dword s5, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s6, s0, 16 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: .LBB85_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_max_u32_e32 v0, s4, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 @@ -7593,27 +7593,27 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dword s5, s[0:1], 0x10 -; VI-NEXT: s_add_u32 s6, s0, 16 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: .LBB104_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_min_i32_e32 v0, s4, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; 
VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index e3e28e9486aa4..005cfe73671bd 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -5053,9 +5053,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; VI-NEXT: s_add_u32 s6, s0, 32 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 @@ -5066,8 +5066,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5075,11 +5075,11 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB73_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -6375,9 +6375,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; VI-NEXT: s_add_u32 s6, s0, 32 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 @@ -6388,8 +6388,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6397,11 +6397,11 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB85_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 
exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -8416,9 +8416,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_add_u32 s0, s0, s6 ; VI-NEXT: s_addc_u32 s1, s1, s7 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20 -; VI-NEXT: s_add_u32 s6, s0, 32 -; VI-NEXT: s_addc_u32 s7, s1, 0 -; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_mov_b64 s[6:7], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 @@ -8429,8 +8429,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8438,11 +8438,11 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; VI-NEXT: s_cbranch_execnz .LBB104_1 ; VI-NEXT: ; %bb.2: ; %atomicrmw.end -; VI-NEXT: s_or_b64 exec, exec, s[0:1] +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 4371eb6c3ee92..e2d55990473c0 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1856,92 +1856,92 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: s_add_u32 s2, s0, 48 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_mov_b32_e32 v13, s2 ; CI-NEXT: s_add_u32 s2, s0, 32 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_mov_b32_e32 v15, s2 ; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v19, s3 -; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v18, s3 +; CI-NEXT: v_mov_b32_e32 v17, s2 ; CI-NEXT: s_add_u32 s2, s0, 0x70 ; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: v_mov_b32_e32 v12, s1 +; CI-NEXT: v_mov_b32_e32 v11, s0 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 -; CI-NEXT: v_lshrrev_b32_e32 
v3, 16, v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; CI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] ; CI-NEXT: s_nop 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: v_mov_b32_e32 v15, s3 -; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v19 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x60 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] +; CI-NEXT: v_mov_b32_e32 v16, s3 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; CI-NEXT: v_mov_b32_e32 v15, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x50 +; CI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] +; CI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; CI-NEXT: v_mov_b32_e32 v14, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x60 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; CI-NEXT: s_addc_u32 s3, s1, 0 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v5 -; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; CI-NEXT: v_mov_b32_e32 v17, s3 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] +; CI-NEXT: v_cvt_f32_f16_e32 v12, v18 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 -; CI-NEXT: v_mov_b32_e32 v16, s2 -; CI-NEXT: s_add_u32 s2, s0, 0x50 ; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; CI-NEXT: s_add_u32 s0, s0, 64 -; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], 
v19 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v17 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; CI-NEXT: v_mov_b32_e32 v21, s3 +; CI-NEXT: v_mov_b32_e32 v20, s3 ; CI-NEXT: v_mov_b32_e32 v13, s1 -; CI-NEXT: v_mov_b32_e32 v20, s2 +; CI-NEXT: v_mov_b32_e32 v19, s2 ; CI-NEXT: v_mov_b32_e32 v12, s0 -; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; CI-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; CI-NEXT: s_endpgm ; @@ -1951,12 +1951,12 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 48 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 @@ -1984,41 +1984,41 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-NEXT: s_add_u32 s0, s0, 0x60 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v22, v0 -; VI-NEXT: v_cvt_f32_f16_sdwa v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v24, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v20, v2 -; VI-NEXT: v_cvt_f32_f16_sdwa v21, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; VI-NEXT: v_cvt_f32_f16_e32 v22, v4 +; VI-NEXT: v_cvt_f32_f16_sdwa v23, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v24, v5 +; VI-NEXT: v_cvt_f32_f16_sdwa v25, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v20, v6 +; VI-NEXT: v_cvt_f32_f16_sdwa v21, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v31, v5 -; VI-NEXT: v_cvt_f32_f16_sdwa v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v26, v6 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; VI-NEXT: v_cvt_f32_f16_sdwa v27, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v26, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v31, v1 +; 
VI-NEXT: v_cvt_f32_f16_sdwa v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v20 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v21 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; VI-NEXT: v_cvt_f32_f16_sdwa v28, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v29, v4 -; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; VI-NEXT: v_cvt_f32_f16_sdwa v30, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v23 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v31 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v29 +; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v30 -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 ; VI-NEXT: v_mov_b32_e32 v21, s3 ; VI-NEXT: v_mov_b32_e32 v23, s1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v26 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index fcbb351277707..8c53d2671de3f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2712,12 +2712,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 @@ -2732,12 +2732,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9 +; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v9 
; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 @@ -2751,8 +2751,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10 +; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v10 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 @@ -2760,38 +2760,38 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX9-NEXT: v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v1 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v2, v7, v4 +; GFX9-NEXT: 
v_add_u16_e32 v3, v7, v3 +; GFX9-NEXT: v_add_u16_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v6 ; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 -; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v5 -; GFX9-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-NEXT: global_store_byte v3, v0, s[2:3] +; GFX9-NEXT: v_mad_legacy_u16 v1, v16, v18, v1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: @@ -2804,12 +2804,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 @@ -2824,12 +2824,12 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 @@ -2843,8 +2843,8 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 @@ -2852,38 +2852,38 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: 
v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v4, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-DL-NEXT: v_or_b32_sdwa v2, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v1 +; GFX9-DL-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v2, v7, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v7, v3 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v6 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v16, v18, v1 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v15, v17, v1 +; GFX9-DL-NEXT: v_add_u16_e32 
v1, v1, v8 +; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 1eebc8e7953e3..b2ba065a079f9 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -1555,15 +1555,15 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s63, 31 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v1 -; GCN-NEXT: v_readfirstlane_b32 s7, v2 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-NEXT: v_readfirstlane_b32 s9, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] +; GCN-NEXT: s_xor_b64 exec, exec, s[6:7] ; GCN-NEXT: s_cbranch_execnz .LBB8_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 84a0cc6c9220a..f5d41b246b1b8 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -109,7 +109,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_hi_u32 s2, s29, s28 ; GFX11-NEXT: s_mul_i32 s3, s29, s28 @@ -127,26 +127,26 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s2, s24, s2 ; GFX11-NEXT: s_lshl_b64 s[20:21], s[2:3], 1 -; GFX11-NEXT: global_load_u16 v2, v1, s[20:21] +; GFX11-NEXT: global_load_u16 v1, v2, s[20:21] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB2_6: ; %bb18 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_cmp_ne_u16_e64 s2, s3, 0 -; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_and_b32 vcc_lo, s8, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, v2, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX11-NEXT: s_mov_b32 vcc_lo, 0 ; GFX11-NEXT: v_readfirstlane_b32 s2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11-NEXT: s_bitcmp1_b32 s2, 0 ; GFX11-NEXT: s_cselect_b32 s2, 0x100, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 4fa5b6cf843c1..f98b41ba199bd 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2798,80 +2798,80 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6 ; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: v_perm_b32 v3, v3, v10, s2 +; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 3 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0 -; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 +; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 14 -; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 +; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 12 -; GFX9-NEXT: v_perm_b32 v0, v10, v0, s2 +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 13 -; GFX9-NEXT: 
v_lshrrev_b32_e32 v15, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 10 -; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 +; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 11 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 8 -; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 +; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 9 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX9-NEXT: v_perm_b32 v5, v10, v5, s2 -; GFX9-NEXT: v_perm_b32 v4, v9, v4, s2 -; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 +; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2 +; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16f16_dynamic: @@ -2975,20 +2975,20 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 -; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3] ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v9, s1 -; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v10, s4 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; CI-NEXT: s_cmp_eq_u32 s5, 15 -; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 14 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -2996,109 +2996,109 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 12 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] +; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1] ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 11 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 10 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cndmask_b32_e32 v13, v13, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 9 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 8 ; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cndmask_b32_e32 v14, v14, v10, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 7 -; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 6 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 5 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 4 -; CI-NEXT: v_or_b32_e32 v3, v3, v11 -; CI-NEXT: v_cndmask_b32_e32 v11, v16, v10, vcc +; CI-NEXT: v_or_b32_e32 v10, v10, v11 +; CI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; CI-NEXT: v_cndmask_b32_e64 v12, v12, v10, s[2:3] -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 +; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3] ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_or_b32_e32 v6, v6, v11 -; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_or_b32_e32 v2, v2, v11 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; CI-NEXT: 
v_lshlrev_b32_e32 v12, 16, v12 ; CI-NEXT: s_cmp_eq_u32 s5, 3 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_or_b32_e32 v2, v2, v12 +; CI-NEXT: v_or_b32_e32 v9, v9, v12 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14 ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 2 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_or_b32_e32 v0, v0, v12 -; CI-NEXT: v_cndmask_b32_e32 v12, v17, v10, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_or_b32_e32 v7, v7, v12 +; CI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 1 -; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 ; CI-NEXT: s_cmp_eq_u32 s5, 0 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc ; CI-NEXT: s_cselect_b64 vcc, -1, 0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; CI-NEXT: v_or_b32_e32 v1, v1, v13 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12 +; CI-NEXT: v_or_b32_e32 v8, v8, v13 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; CI-NEXT: v_or_b32_e32 v5, v5, v10 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; CI-NEXT: v_or_b32_e32 v7, v7, v13 -; CI-NEXT: v_or_b32_e32 v4, v4, v10 -; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; CI-NEXT: s_nop 0 -; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v8 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CI-NEXT: v_or_b32_e32 v1, v1, v6 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; CI-NEXT: v_or_b32_e32 v3, v3, v13 +; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CI-NEXT: s_endpgm ; ; GFX11-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll index cb544732431a6..64948c374e4dd 100644 --- a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -455,13 +455,13 @@ define float @v_test_known_not_snan_round_input_fmed3_r_i_i_f32(float %a) #0 { ; GCN-LABEL: v_test_known_not_snan_round_input_fmed3_r_i_i_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_trunc_f32_e32 v1, v0 +; GCN-NEXT: v_sub_f32_e32 v2, v0, v1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] ; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_trunc_f32_e32 v2, v0 -; GCN-NEXT: v_bfi_b32 v1, s4, 1.0, v0 -; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-NEXT: v_add_f32_e32 v0, v2, v0 +; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GCN-NEXT: v_add_f32_e32 v0, v1, v0 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %known.not.snan = call float @llvm.round.f32(float %a) diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 035903b9b068e..288616086eb8e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -863,68 +863,68 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; GCN-NEXT: ds_read_b128 a[0:3], v1 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; GCN-NEXT: v_add_u32_e32 v0, s0, v1 +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; GCN-NEXT: ds_read_b128 a[0:3], v0 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v1, s1, v1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:80 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:64 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:48 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 -; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:8192 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:112 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:96 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:80 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:64 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:48 +; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:32 +; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:16 +; GCN-NEXT: ds_write_b128 v1, a[0:3] +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:8288 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:8272 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: 
v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:24688 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:24672 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:24656 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:24640 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:24624 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:24608 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:24592 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:8288 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:8304 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:8256 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:8272 +; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:8224 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:8240 +; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:8192 +; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:8208 +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:24576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -933,47 +933,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:49264 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:49248 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:49232 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:49216 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:49200 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:49184 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:49168 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:49152 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:16480 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:16496 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:16448 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:16464 +; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:16416 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:16432 +; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:16384 +; 
GCN-NEXT: ds_write_b128 v1, a[4:7] offset:16400 +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:49264 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:49152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; GCN-NEXT: v_add_u32_e32 v0, 0x6000, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:57456 -; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:57440 -; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:57424 -; GCN-NEXT: ds_read_b128 a[16:19], v1 offset:57408 -; GCN-NEXT: ds_read_b128 a[0:3], v1 offset:57344 -; GCN-NEXT: ds_read_b128 a[4:7], v1 offset:57360 -; GCN-NEXT: ds_read_b128 a[8:11], v1 offset:57376 -; GCN-NEXT: ds_read_b128 a[12:15], v1 offset:57392 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:24672 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:24688 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:24640 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:24656 +; GCN-NEXT: ds_write_b128 v1, a[8:11] offset:24608 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:24624 +; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:24576 +; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:24592 +; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:57392 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -982,82 +982,82 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-NEXT: ds_write_b128 v1, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v1, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v1, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v1, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v1, 
a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v1, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v1, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v1, a[4:7] offset:32784 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry ; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:64 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:16 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:32 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:48 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s0, v1 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:112 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:96 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:80 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:64 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:16 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:32 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s1, v1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:80 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:64 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:48 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:8304 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:8288 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:8272 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:8256 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:8240 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:8224 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:112 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:96 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:80 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:64 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:48 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:32 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:16 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:8304 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:8288 +; 
EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:8272 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:8256 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:8240 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:8224 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:8256 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:8272 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:8224 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:8240 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:8192 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:8208 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:24688 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:24672 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:24656 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:24640 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:24624 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:24608 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:8288 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:8304 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:8256 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:8272 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:8224 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:8240 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:8192 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:8208 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:24688 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:24672 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:24656 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:24640 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:24624 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:24608 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:24576 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -1066,47 +1066,47 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:16448 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:16464 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:16416 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:16432 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:16384 -; 
EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16400 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:49264 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:49248 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:49232 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:49216 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:49200 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:49184 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:49168 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:49152 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:16480 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:16496 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:16448 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:16464 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:16416 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:16432 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:16384 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:16400 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:49264 +; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:49248 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:49232 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:49216 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:49200 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:49184 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:49168 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:49152 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, 0x6000, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, 0x6000, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 1 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:24640 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:24656 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:24608 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:24624 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:24576 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:24592 -; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:57456 -; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:57440 -; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:57424 -; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v1 offset:57408 -; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v1 offset:57344 -; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v1 offset:57360 -; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v1 offset:57376 -; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v1 offset:57392 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:24672 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:24688 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:24640 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:24656 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:24608 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:24624 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:24576 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:24592 +; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:57456 +; EXACTCUTOFF-NEXT: ds_read_b128 
a[24:27], v0 offset:57440 +; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:57424 +; EXACTCUTOFF-NEXT: ds_read_b128 a[16:19], v0 offset:57408 +; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:57344 +; EXACTCUTOFF-NEXT: ds_read_b128 a[4:7], v0 offset:57360 +; EXACTCUTOFF-NEXT: ds_read_b128 a[8:11], v0 offset:57376 +; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:57392 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) @@ -1115,14 +1115,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 7 ; EXACTCUTOFF-NEXT: s_nop 2 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[16:19] offset:32832 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[20:23] offset:32848 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32800 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[12:15] offset:32816 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] offset:32768 -; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[24:27] offset:32864 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[28:31] offset:32880 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[16:19] offset:32832 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[20:23] offset:32848 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[8:11] offset:32800 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[12:15] offset:32816 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[0:3] offset:32768 +; EXACTCUTOFF-NEXT: ds_write_b128 v1, a[4:7] offset:32784 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index fa0c723c64e36..0c49338bfcab9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -707,42 +707,42 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_exp2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s0, 0xc2fc0000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x42800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x1f800000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s8 ; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s9 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 -; SI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[0:1] ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_add_f32_e32 v1, s5, v1 +; SI-GISEL-NEXT: v_add_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_exp_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2fc0000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc ; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 1.0, v4, s[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; 
SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[0:1] -; SI-GISEL-NEXT: v_add_f32_e32 v5, s6, v5 -; SI-GISEL-NEXT: v_add_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v5, s10, v5 +; SI-GISEL-NEXT: v_add_f32_e32 v2, s11, v2 ; SI-GISEL-NEXT: v_exp_f32_e32 v5, v5 ; SI-GISEL-NEXT: v_exp_f32_e32 v3, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[0:1] ; SI-GISEL-NEXT: v_mul_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: v_mul_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s10, -1 -; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_exp2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index af12c10fec5d6..d499e017e92f4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -865,42 +865,42 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in) ; ; SI-GISEL-LABEL: s_log2_v4f32: ; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd -; SI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; SI-GISEL-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-GISEL-NEXT: s_mov_b32 s0, 0x800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x4f800000 ; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42000000 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s8 ; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, s9 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v3, vcc ; SI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, s8, v0 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[0:1] ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s5, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, s9, v1 ; SI-GISEL-NEXT: v_log_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x800000 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v2 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s7, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s10, v2 +; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s11, v2 ; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v5 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v5, 1.0, v3, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[0:1] -; SI-GISEL-NEXT: v_mul_f32_e32 v5, s6, v5 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, s7, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v5, s10, v5 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, s11, v2 ; SI-GISEL-NEXT: v_log_f32_e32 v5, v5 ; SI-GISEL-NEXT: v_log_f32_e32 v3, v2 ; SI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; SI-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; SI-GISEL-NEXT: v_sub_f32_e32 v2, v5, v2 ; SI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 -; SI-GISEL-NEXT: s_mov_b32 s10, -1 -; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 -; SI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: s_log2_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 6b013175ff61b..1a51c8708b941 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -9,30 +9,32 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s5, 0xfffff ; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s8, s3, 0xb0014 -; SI-NEXT: s_addk_i32 s8, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; SI-NEXT: s_bfe_u32 s7, s3, 0xb0014 +; SI-NEXT: s_addk_i32 s7, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; SI-NEXT: s_and_b32 s8, s3, 0x80000000 ; SI-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5] -; SI-NEXT: s_and_b32 s10, s3, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s10, s5 -; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s5, s8, s5 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: s_cselect_b32 s8, s2, s4 ; SI-NEXT: s_cselect_b32 s9, s3, s5 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1] ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[0:1]|, 0.5 -; SI-NEXT: s_or_b32 s0, s10, 0x3ff00000 -; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; SI-NEXT: s_cselect_b32 s0, s0, 0 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -40,19 +42,21 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; CI-LABEL: round_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s8, 0 +; CI-NEXT: s_brev_b32 s5, -2 ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s4, s0 ; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] -; CI-NEXT: s_and_b32 s0, s3, 0x80000000 -; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[2:3]|, 0.5 -; CI-NEXT: s_or_b32 s0, s0, 0x3ff00000 -; CI-NEXT: s_and_b64 s[2:3], s[2:3], exec -; CI-NEXT: s_cselect_b32 s9, s0, 0 -; CI-NEXT: v_add_f64 v[0:1], v[0:1], s[8:9] +; CI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v2, s3 +; CI-NEXT: s_and_b64 s[2:3], s[8:9], exec +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: v_bfi_b32 v3, s5, v3, v2 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] ; CI-NEXT: s_mov_b32 s5, s1 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm @@ -75,6 +79,7 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_movk_i32 s4, 0xfc01 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 ; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 @@ 
-90,11 +95,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 51, v6 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] -; SI-NEXT: v_or_b32_e32 v6, 0x3ff00000, v7 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; SI-NEXT: v_bfi_b32 v3, s2, v2, v3 ; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 @@ -110,16 +116,17 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] -; CI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 -; CI-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] -; CI-NEXT: v_or_b32_e32 v6, 0x3ff00000, v6 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; CI-NEXT: v_bfi_b32 v3, s2, v2, v3 ; CI-NEXT: v_mov_b32_e32 v2, v1 -; CI-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -154,12 +161,12 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: v_mov_b32_e32 v1, s11 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_or_b32 s3, s12, 0x3ff00000 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_cmp_ge_f64_e64 s[12:13], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 ; SI-NEXT: s_addk_i32 s3, 0xfc01 ; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], s3 @@ -173,13 +180,16 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: s_cselect_b32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[2:3] -; SI-NEXT: s_or_b32 s3, s8, 0x3ff00000 -; SI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 +; SI-NEXT: v_add_f64 v[2:3], s[4:5], -v[2:3] +; SI-NEXT: s_brev_b32 s12, -2 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5 +; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; SI-NEXT: s_and_b64 s[8:9], s[8:9], exec +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[2:3], s[10:11], v[0:1] -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 ; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfi_b32 v1, s12, v1, v4 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] ; SI-NEXT: s_mov_b32 s3, 
0xf000 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -189,25 +199,28 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s8, 0 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5] -; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: s_and_b32 s2, s7, 0x80000000 -; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_add_f64 v[6:7], s[4:5], -v[4:5] +; CI-NEXT: v_trunc_f64_e32 v[2:3], s[6:7] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] +; CI-NEXT: v_add_f64 v[4:5], s[6:7], -v[2:3] +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[4:5]|, 0.5 +; CI-NEXT: v_add_f64 v[4:5], s[4:5], -v[6:7] ; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; CI-NEXT: s_cselect_b32 s9, s2, 0 -; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[6:7]|, 0.5 -; CI-NEXT: s_and_b32 s2, s5, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: s_and_b64 s[4:5], s[6:7], exec -; CI-NEXT: v_add_f64 v[2:3], v[0:1], s[8:9] -; CI-NEXT: s_cselect_b32 s9, s2, 0 -; CI-NEXT: v_add_f64 v[0:1], v[4:5], s[8:9] +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[4:5]|, 0.5 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: v_bfi_b32 v1, s2, v8, v1 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v4, s5 +; CI-NEXT: v_bfi_b32 v1, s2, v1, v4 +; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[0:1] ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -223,7 +236,7 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s13, 0xfffff ; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: s_brev_b32 s18, -2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 ; SI-NEXT: s_addk_i32 s3, 0xfc01 @@ -239,12 +252,12 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: v_mov_b32_e32 v0, s14 ; SI-NEXT: v_mov_b32_e32 v1, s15 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_or_b32 s3, s16, 0x3ff00000 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: v_cmp_ge_f64_e64 s[16:17], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b64 s[16:17], s[16:17], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 ; SI-NEXT: s_addk_i32 s3, 0xfc01 ; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], s3 @@ -255,15 +268,16 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_cselect_b32 s7, s16, s7 ; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s6, s4, s6 +; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 ; SI-NEXT: s_cselect_b32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: s_or_b32 s3, s16, 0x3ff00000 -; SI-NEXT: 
v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[2:3], s[14:15], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 +; SI-NEXT: v_cmp_ge_f64_e64 s[16:17], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: s_and_b64 s[14:15], s[16:17], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 ; SI-NEXT: s_addk_i32 s3, 0xfc01 @@ -278,13 +292,13 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[0:1] +; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] +; SI-NEXT: v_bfi_b32 v5, s18, v5, v6 +; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[4:5] -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[6:7]|, 0.5 -; SI-NEXT: s_or_b32 s3, s14, 0x3ff00000 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_and_b64 s[6:7], s[14:15], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s3 ; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 ; SI-NEXT: s_addk_i32 s3, 0xfc01 ; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], s3 @@ -296,15 +310,19 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s6, s8, s6 ; SI-NEXT: s_cselect_b32 s7, s9, s7 -; SI-NEXT: v_mov_b32_e32 v6, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_add_f64 v[8:9], s[8:9], -v[6:7] +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v6, s7 +; SI-NEXT: v_add_f64 v[6:7], s[8:9], -v[5:6] +; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[6:7]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s18, v8, v9 ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[4:5] -; SI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[8:9]|, 0.5 -; SI-NEXT: s_or_b32 s3, s10, 0x3ff00000 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v8, s9 +; SI-NEXT: v_bfi_b32 v5, s18, v5, v8 ; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -315,42 +333,47 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; CI-NEXT: s_mov_b32 s12, 0 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: s_and_b32 s2, s7, 0x80000000 +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v8, s4 ; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_add_f64 v[6:7], s[4:5], -v[4:5] +; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] ; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; 
CI-NEXT: s_cselect_b32 s13, s2, 0 -; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[6:7]|, 0.5 -; CI-NEXT: s_and_b32 s2, s5, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[10:11] -; CI-NEXT: s_and_b64 s[4:5], s[6:7], exec -; CI-NEXT: v_add_f64 v[2:3], v[0:1], s[12:13] -; CI-NEXT: s_cselect_b32 s13, s2, 0 -; CI-NEXT: v_add_f64 v[8:9], s[10:11], -v[6:7] -; CI-NEXT: v_add_f64 v[0:1], v[4:5], s[12:13] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[8:9]|, 0.5 -; CI-NEXT: s_and_b32 s2, s11, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5] +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v10, s5 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] +; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: s_cselect_b32 s13, s2, 0 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[8:9]|, 0.5 -; CI-NEXT: s_and_b32 s2, s9, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 +; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[10:11] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v12, s11 ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[6:7], v[6:7], s[12:13] -; CI-NEXT: s_cselect_b32 s13, s2, 0 -; CI-NEXT: v_add_f64 v[4:5], v[4:5], s[12:13] +; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v8, s9 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v8 +; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -383,152 +406,161 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v0, s22 ; SI-NEXT: v_mov_b32_e32 v1, s23 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] -; SI-NEXT: s_or_b32 s3, s24, 0x3ff00000 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[0:1]|, 0.5 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s3 +; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b64 s[24:25], s[24:25], exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 +; SI-NEXT: s_add_i32 s24, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s24 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] -; SI-NEXT: s_and_b32 s24, s5, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s25, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s24, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s24, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s7, s25, s7 +; SI-NEXT: s_cmp_gt_i32 s24, 51 ; SI-NEXT: s_cselect_b32 s6, s4, s6 +; SI-NEXT: v_bfi_b32 v9, s3, v0, v1 ; SI-NEXT: s_cselect_b32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 ; 
SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] -; SI-NEXT: s_or_b32 s3, s24, 0x3ff00000 -; SI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[2:3], s[22:23], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s11, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s3 +; SI-NEXT: v_cmp_ge_f64_e64 s[24:25], |v[0:1]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: s_and_b64 s[22:23], s[24:25], exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 +; SI-NEXT: s_add_i32 s22, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s22 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] -; SI-NEXT: s_and_b32 s22, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s23, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s22, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s22, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s5, s23, s5 +; SI-NEXT: s_cmp_gt_i32 s22, 51 ; SI-NEXT: s_cselect_b32 s4, s10, s4 ; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_f64 v[4:5], s[10:11], -v[0:1] +; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] +; SI-NEXT: v_bfi_b32 v9, s3, v4, v5 +; SI-NEXT: v_cmp_ge_f64_e64 s[22:23], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[8:9] -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[4:5]|, 0.5 -; SI-NEXT: s_or_b32 s3, s22, 0x3ff00000 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s3 +; SI-NEXT: s_and_b64 s[6:7], s[22:23], exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: s_bfe_u32 s6, s9, 0xb0014 +; SI-NEXT: s_add_i32 s10, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_andn2_b64 s[6:7], s[8:9], s[6:7] -; SI-NEXT: s_and_b32 s10, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s11, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s10, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s7, s11, s7 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_cselect_b32 s6, s8, s6 ; SI-NEXT: s_cselect_b32 s7, s9, s7 ; SI-NEXT: v_mov_b32_e32 v4, s6 ; SI-NEXT: v_mov_b32_e32 v5, s7 ; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[4:5] +; SI-NEXT: v_bfi_b32 v9, s3, v6, v7 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[8:9] -; SI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 -; SI-NEXT: s_or_b32 s3, s10, 0x3ff00000 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s3 +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: s_bfe_u32 s4, s15, 0xb0014 +; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s8 +; SI-NEXT: v_mov_b32_e32 v10, s9 ; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[4:5] -; SI-NEXT: s_and_b32 s8, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s15, 0x80000000 +; SI-NEXT: 
s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s4, s14, s4 ; SI-NEXT: s_cselect_b32 s5, s15, s5 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: v_add_f64 v[10:11], s[14:15], -v[4:5] +; SI-NEXT: v_add_f64 v[4:5], s[14:15], -v[4:5] +; SI-NEXT: v_bfi_b32 v9, s3, v9, v10 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[4:5]|, 0.5 ; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[8:9] -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[10:11]|, 0.5 -; SI-NEXT: s_or_b32 s3, s8, 0x3ff00000 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s3 +; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v12, s6 +; SI-NEXT: s_bfe_u32 s6, s13, 0xb0014 +; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 ; SI-NEXT: s_andn2_b64 s[6:7], s[12:13], s[6:7] -; SI-NEXT: s_and_b32 s8, s13, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s13, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s8, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s7, s9, s7 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s7, s13, s7 ; SI-NEXT: s_cselect_b32 s6, s12, s6 -; SI-NEXT: v_mov_b32_e32 v11, s7 -; SI-NEXT: v_mov_b32_e32 v10, s6 -; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[10:11] +; SI-NEXT: v_mov_b32_e32 v10, s7 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[9:10] +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v12, v13 ; SI-NEXT: v_add_f64 v[12:13], s[4:5], v[8:9] -; SI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 -; SI-NEXT: s_or_b32 s3, s8, 0x3ff00000 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s19, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s3 +; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: s_bfe_u32 s4, s19, 0xb0014 +; SI-NEXT: s_add_i32 s8, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s8 ; SI-NEXT: s_andn2_b64 s[4:5], s[18:19], s[4:5] -; SI-NEXT: s_and_b32 s8, s19, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s19, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s5, s19, s5 ; SI-NEXT: s_cselect_b32 s4, s18, s4 -; SI-NEXT: v_mov_b32_e32 v11, s5 -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: v_add_f64 v[14:15], s[18:19], -v[10:11] +; SI-NEXT: v_mov_b32_e32 v10, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[9:10] +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v14, v15 ; SI-NEXT: v_add_f64 v[10:11], s[6:7], v[8:9] -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5 -; SI-NEXT: s_or_b32 s3, s8, 0x3ff00000 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: 
s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: s_bfe_u32 s3, s17, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s3 +; SI-NEXT: s_and_b64 s[6:7], s[8:9], exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: s_bfe_u32 s6, s17, 0xb0014 +; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], s8 ; SI-NEXT: s_andn2_b64 s[6:7], s[16:17], s[6:7] -; SI-NEXT: s_and_b32 s8, s17, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s9, s17, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s6, 0, s6 -; SI-NEXT: s_cselect_b32 s7, s8, s7 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s7, s9, s7 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s7, s17, s7 ; SI-NEXT: s_cselect_b32 s6, s16, s6 ; SI-NEXT: v_mov_b32_e32 v15, s7 ; SI-NEXT: v_mov_b32_e32 v14, s6 ; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[14:15] +; SI-NEXT: v_mov_b32_e32 v16, s19 +; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[14:15]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v16 ; SI-NEXT: v_add_f64 v[16:17], s[4:5], v[8:9] -; SI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[14:15]|, 0.5 -; SI-NEXT: s_or_b32 s3, s8, 0x3ff00000 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 +; SI-NEXT: s_and_b64 s[4:5], s[8:9], exec +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_mov_b32_e32 v14, s17 +; SI-NEXT: v_bfi_b32 v9, s3, v9, v14 ; SI-NEXT: v_add_f64 v[14:15], s[6:7], v[8:9] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -541,78 +573,87 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; CI-NEXT: s_mov_b32 s20, 0 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[4:5] ; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: s_and_b32 s2, s7, 0x80000000 +; CI-NEXT: v_mov_b32_e32 v5, s7 ; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_add_f64 v[6:7], s[4:5], -v[4:5] +; CI-NEXT: v_add_f64 v[2:3], s[4:5], -v[6:7] ; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; CI-NEXT: s_cselect_b32 s21, s2, 0 -; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[6:7]|, 0.5 -; CI-NEXT: s_and_b32 s2, s5, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[10:11] -; CI-NEXT: s_and_b64 s[4:5], s[6:7], exec -; CI-NEXT: v_add_f64 v[2:3], v[0:1], s[20:21] -; CI-NEXT: s_cselect_b32 s21, s2, 0 -; CI-NEXT: v_add_f64 v[8:9], s[10:11], -v[6:7] -; CI-NEXT: v_add_f64 v[0:1], v[4:5], s[20:21] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[8:9]|, 0.5 -; CI-NEXT: s_and_b32 s2, s11, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[8:9]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] -; CI-NEXT: s_cselect_b32 s21, s2, 0 -; CI-NEXT: s_and_b32 s2, s9, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_add_f64 v[10:11], s[14:15], -v[8:9] +; CI-NEXT: 
s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[10:11] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v10, s5 +; CI-NEXT: v_add_f64 v[0:1], s[10:11], -v[8:9] +; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 +; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_trunc_f64_e32 v[12:13], s[12:13] -; CI-NEXT: v_add_f64 v[6:7], v[6:7], s[20:21] -; CI-NEXT: s_cselect_b32 s21, s2, 0 +; CI-NEXT: v_add_f64 v[10:11], s[8:9], -v[6:7] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 -; CI-NEXT: s_and_b32 s2, s15, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_add_f64 v[14:15], s[12:13], -v[12:13] +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] +; CI-NEXT: v_mov_b32_e32 v12, s11 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[12:13], s[14:15], -v[10:11] +; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v14, s9 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[14:15], s[12:13] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[4:5], v[4:5], s[20:21] -; CI-NEXT: s_cselect_b32 s21, s2, 0 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[14:15]|, 0.5 -; CI-NEXT: s_and_b32 s2, s13, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_trunc_f64_e32 v[14:15], s[18:19] +; CI-NEXT: v_add_f64 v[12:13], s[12:13], -v[14:15] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[12:13]|, 0.5 +; CI-NEXT: v_mov_b32_e32 v16, s15 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v16 ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[10:11], v[8:9], s[20:21] -; CI-NEXT: s_cselect_b32 s21, s2, 0 -; CI-NEXT: v_add_f64 v[16:17], s[18:19], -v[14:15] -; CI-NEXT: v_add_f64 v[8:9], v[12:13], s[20:21] -; CI-NEXT: v_trunc_f64_e32 v[12:13], s[16:17] -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[16:17]|, 0.5 -; CI-NEXT: s_and_b32 s2, s19, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 -; CI-NEXT: v_add_f64 v[16:17], s[16:17], -v[12:13] +; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v18, s13 +; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[16:17] +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[10:11]|, 0.5 +; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] +; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: s_cselect_b32 s21, s2, 0 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[16:17]|, 0.5 -; CI-NEXT: s_and_b32 s2, s17, 0x80000000 -; CI-NEXT: s_or_b32 s2, s2, 0x3ff00000 +; CI-NEXT: v_add_f64 v[18:19], s[16:17], -v[14:15] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[18:19]|, 0.5 +; CI-NEXT: v_mov_b32_e32 
v20, s19 ; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[14:15], v[14:15], s[20:21] -; CI-NEXT: s_cselect_b32 s21, s2, 0 -; CI-NEXT: v_add_f64 v[12:13], v[12:13], s[20:21] +; CI-NEXT: v_bfi_b32 v5, s2, v5, v20 +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_mov_b32_e32 v18, s17 +; CI-NEXT: v_bfi_b32 v5, s2, v5, v18 +; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 1489092aef5ff..2e9197dfd3d9d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -8,37 +8,37 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX6-LABEL: round_f32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb +; GFX6-NEXT: s_load_dword s6, s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_brev_b32 s5, -2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_trunc_f32_e32 v1, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_sub_f32_e32 v2, s4, v1 -; GFX6-NEXT: v_bfi_b32 v0, s5, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX6-NEXT: v_trunc_f32_e32 v0, s6 +; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: round_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_brev_b32 s5, -2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_sub_f32_e32 v2, s4, v1 -; GFX8-NEXT: v_bfi_b32 v0, s5, 1.0, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX8-NEXT: s_brev_b32 s4, -2 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -46,17 +46,17 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; 
GFX9-NEXT: s_brev_b32 s0, -2 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_sub_f32_e32 v2, s2, v1 -; GFX9-NEXT: v_bfi_b32 v0, s0, 1.0, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_trunc_f32_e32 v0, s2 +; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -65,16 +65,17 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s2 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, 1.0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, 0.5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 @@ -83,17 +84,16 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { ; ; R600-LABEL: round_f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: ; R600-NEXT: TRUNC * T0.W, KC0[2].Z, ; R600-NEXT: ADD * T1.W, KC0[2].Z, -PV.W, -; R600-NEXT: BFI_INT T2.W, literal.x, 1.0, KC0[2].Z, ; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5, +; R600-NEXT: BFI_INT * T1.W, literal.x, PV.W, KC0[2].Z, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: CNDE * T1.W, PS, 0.0, PV.W, ; R600-NEXT: ADD T0.X, T0.W, PV.W, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) @@ -114,22 +114,22 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_trunc_f32_e32 v1, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, s3 -; GFX6-NEXT: v_sub_f32_e32 v2, s3, v1 -; GFX6-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_trunc_f32_e32 v2, s2 -; GFX6-NEXT: v_add_f32_e32 v1, v1, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_sub_f32_e32 v3, s2, v2 -; GFX6-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, 0.5 -; GFX6-NEXT: 
v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX6-NEXT: v_trunc_f32_e32 v0, s3 +; GFX6-NEXT: v_sub_f32_e32 v1, s3, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NEXT: v_bfi_b32 v1, s8, v1, v2 +; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX6-NEXT: v_trunc_f32_e32 v0, s2 +; GFX6-NEXT: v_sub_f32_e32 v2, s2, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_bfi_b32 v2, s8, v2, v3 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -140,22 +140,22 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; GFX89-NEXT: s_mov_b32 s7, 0xf000 ; GFX89-NEXT: s_mov_b32 s6, -1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v1, s3 -; GFX89-NEXT: v_mov_b32_e32 v0, s3 -; GFX89-NEXT: v_sub_f32_e32 v2, s3, v1 -; GFX89-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX89-NEXT: v_trunc_f32_e32 v2, s2 -; GFX89-NEXT: v_add_f32_e32 v1, v1, v0 -; GFX89-NEXT: v_mov_b32_e32 v0, s2 -; GFX89-NEXT: v_sub_f32_e32 v3, s2, v2 -; GFX89-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX89-NEXT: v_trunc_f32_e32 v0, s3 +; GFX89-NEXT: v_sub_f32_e32 v1, s3, v0 ; GFX89-NEXT: s_mov_b32 s4, s0 ; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX89-NEXT: v_mov_b32_e32 v2, s3 +; GFX89-NEXT: v_bfi_b32 v1, s8, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s2 +; GFX89-NEXT: v_sub_f32_e32 v2, s2, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] +; GFX89-NEXT: v_mov_b32_e32 v3, s2 +; GFX89-NEXT: v_bfi_b32 v2, s8, v2, v3 +; GFX89-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX89-NEXT: s_endpgm ; @@ -165,18 +165,23 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f32_e32 v0, s3 ; GFX11-NEXT: v_trunc_f32_e32 v2, s2 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, 1.0, s3 -; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, 1.0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v1, s3, v0 :: v_dual_sub_f32 v4, s2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f32_e32 v1, s3, v0 +; GFX11-NEXT: v_sub_f32_e32 v3, s2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v1|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v1, v0, v1 :: v_dual_add_f32 v0, v2, v3 +; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -184,25 +189,23 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; R600-LABEL: round_v2f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 15, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: -; R600-NEXT: TRUNC T0.W, KC0[3].X, -; R600-NEXT: TRUNC * T1.W, KC0[2].W, -; R600-NEXT: ADD * T2.W, KC0[3].X, -PV.W, -; R600-NEXT: SETGE T0.Z, |PV.W|, 0.5, -; R600-NEXT: BFI_INT T2.W, literal.x, 1.0, KC0[3].X, -; R600-NEXT: ADD * T3.W, KC0[2].W, -T1.W, +; R600-NEXT: TRUNC * T0.W, KC0[3].X, +; R600-NEXT: ADD T1.W, KC0[3].X, -PV.W, +; R600-NEXT: TRUNC * T2.W, KC0[2].W, +; R600-NEXT: ADD T3.W, KC0[2].W, -PS, +; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5, +; R600-NEXT: BFI_INT T1.W, literal.x, PS, KC0[3].X, +; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: SETGE T1.Z, |PS|, 0.5, -; R600-NEXT: BFI_INT T3.W, literal.x, 1.0, KC0[2].W, -; R600-NEXT: CNDE * T2.W, PV.Z, 0.0, PV.W, BS:VEC_021/SCL_122 +; R600-NEXT: ADD T0.Y, T0.W, PV.W, +; R600-NEXT: BFI_INT * T0.W, literal.x, PS, KC0[2].W, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: ADD T0.Y, T0.W, PS, -; R600-NEXT: CNDE * T0.W, PV.Z, 0.0, PV.W, -; R600-NEXT: ADD T0.X, T1.W, PV.W, +; R600-NEXT: ADD T0.X, T2.W, PV.W, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 @@ -215,38 +218,38 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_brev_b32 s8, -2 +; GFX6-NEXT: s_brev_b32 s10, -2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_trunc_f32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_sub_f32_e32 v2, s7, v1 -; GFX6-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_add_f32_e32 v3, v1, v0 -; GFX6-NEXT: v_trunc_f32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_sub_f32_e32 v2, s6, v1 -; GFX6-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_add_f32_e32 v2, v1, v0 -; GFX6-NEXT: v_trunc_f32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_sub_f32_e32 v4, s5, v1 -; GFX6-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_trunc_f32_e32 v4, s4 -; GFX6-NEXT: v_add_f32_e32 v1, v1, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_sub_f32_e32 v5, s4, v4 -; GFX6-NEXT: v_bfi_b32 
v0, s8, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX6-NEXT: v_trunc_f32_e32 v0, s7 +; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX6-NEXT: v_trunc_f32_e32 v0, s6 +; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX6-NEXT: v_trunc_f32_e32 v0, s5 +; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v4, s5 +; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX6-NEXT: v_trunc_f32_e32 v0, s4 +; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -254,38 +257,38 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_brev_b32 s8, -2 +; GFX8-NEXT: s_brev_b32 s10, -2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_trunc_f32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_sub_f32_e32 v2, s7, v1 -; GFX8-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8-NEXT: v_add_f32_e32 v3, v1, v0 -; GFX8-NEXT: v_trunc_f32_e32 v1, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_sub_f32_e32 v2, s6, v1 -; GFX8-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8-NEXT: v_add_f32_e32 v2, v1, v0 -; GFX8-NEXT: v_trunc_f32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v0, s5 -; GFX8-NEXT: v_sub_f32_e32 v4, s5, v1 -; GFX8-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8-NEXT: v_trunc_f32_e32 v4, s4 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_sub_f32_e32 v5, s4, v4 -; GFX8-NEXT: v_bfi_b32 v0, s8, 1.0, v0 -; GFX8-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX8-NEXT: v_trunc_f32_e32 v0, s7 +; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s6 +; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 +; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s5 +; GFX8-NEXT: v_sub_f32_e32 v1, s5, 
v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX8-NEXT: v_trunc_f32_e32 v0, s4 +; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -293,38 +296,38 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-NEXT: s_brev_b32 s0, -2 +; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_trunc_f32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_sub_f32_e32 v2, s7, v1 -; GFX9-NEXT: v_bfi_b32 v0, s0, 1.0, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_add_f32_e32 v3, v1, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_sub_f32_e32 v2, s6, v1 -; GFX9-NEXT: v_bfi_b32 v0, s0, 1.0, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_add_f32_e32 v2, v1, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_sub_f32_e32 v4, s5, v1 -; GFX9-NEXT: v_bfi_b32 v0, s0, 1.0, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_trunc_f32_e32 v4, s4 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_sub_f32_e32 v5, s4, v4 -; GFX9-NEXT: v_bfi_b32 v0, s0, 1.0, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX9-NEXT: v_trunc_f32_e32 v0, s7 +; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s6 +; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s5 +; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX9-NEXT: v_trunc_f32_e32 v0, s4 +; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; @@ -334,31 +337,35 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: 
s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v1, s7 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, 1.0, s7 -; GFX11-NEXT: v_trunc_f32_e32 v4, s6 -; GFX11-NEXT: v_trunc_f32_e32 v5, s5 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, 1.0, s6 -; GFX11-NEXT: v_sub_f32_e32 v7, s7, v1 -; GFX11-NEXT: v_trunc_f32_e32 v6, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_sub_f32 v9, s6, v4 :: v_dual_sub_f32 v10, s5, v5 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, 1.0, s5 -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, 0.5 -; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, 1.0, s4 -; GFX11-NEXT: v_dual_sub_f32 v11, s4, v6 :: v_dual_cndmask_b32 v0, 0, v0 -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v9|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v10|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v3, vcc_lo -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v11|, 0.5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v3, v1, v0 :: v_dual_add_f32 v2, v4, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v8, 0, v8 :: v_dual_add_f32 v1, v5, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v0, v6, v8 +; GFX11-NEXT: v_trunc_f32_e32 v0, s7 +; GFX11-NEXT: v_trunc_f32_e32 v1, s6 +; GFX11-NEXT: v_trunc_f32_e32 v4, s5 +; GFX11-NEXT: v_trunc_f32_e32 v5, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1 +; GFX11-NEXT: v_dual_sub_f32 v6, s5, v4 :: v_dual_sub_f32 v7, s4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v6|, 0.5 +; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5 +; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s4 +; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -366,40 +373,36 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) # ; ; R600-LABEL: round_v4f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 29, @4, KC0[CB0:0-32], KC1[] -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD ; R600-NEXT: ALU clause starting at 4: ; R600-NEXT: TRUNC 
* T0.W, KC0[4].X, ; R600-NEXT: ADD T1.W, KC0[4].X, -PV.W, ; R600-NEXT: TRUNC * T2.W, KC0[3].W, -; R600-NEXT: TRUNC T0.Y, KC0[3].Z, -; R600-NEXT: ADD T0.Z, KC0[3].W, -PS, -; R600-NEXT: BFI_INT T3.W, literal.x, 1.0, KC0[4].X, +; R600-NEXT: TRUNC T0.Z, KC0[3].Z, +; R600-NEXT: ADD T3.W, KC0[3].W, -PS, ; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5, +; R600-NEXT: BFI_INT T0.Y, literal.x, PS, KC0[4].X, +; R600-NEXT: SETGE T1.Z, |PV.W|, 0.5, +; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.Z, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: CNDE T0.X, PS, 0.0, PV.W, -; R600-NEXT: TRUNC T1.Y, KC0[3].Y, -; R600-NEXT: SETGE T0.Z, |PV.Z|, 0.5, -; R600-NEXT: BFI_INT T1.W, literal.x, 1.0, KC0[3].W, -; R600-NEXT: ADD * T3.W, KC0[3].Z, -PV.Y, +; R600-NEXT: TRUNC * T3.W, KC0[3].Y, +; R600-NEXT: ADD T1.Y, KC0[3].Y, -PV.W, +; R600-NEXT: SETGE T2.Z, |T1.W|, 0.5, +; R600-NEXT: BFI_INT T1.W, literal.x, T1.Z, KC0[3].W, +; R600-NEXT: ADD * T4.W, T0.W, T0.Y, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: SETGE T1.X, |PS|, 0.5, -; R600-NEXT: BFI_INT T2.Y, literal.x, 1.0, KC0[3].Z, -; R600-NEXT: CNDE T0.Z, PV.Z, 0.0, PV.W, -; R600-NEXT: ADD T1.W, KC0[3].Y, -PV.Y, -; R600-NEXT: ADD * T0.W, T0.W, PV.X, +; R600-NEXT: ADD T4.Z, T2.W, PV.W, +; R600-NEXT: BFI_INT T0.W, literal.x, PV.Z, KC0[3].Z, +; R600-NEXT: SETGE * T1.W, |PV.Y|, 0.5, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: SETGE T3.Y, |PV.W|, 0.5, -; R600-NEXT: ADD T0.Z, T2.W, PV.Z, -; R600-NEXT: BFI_INT T1.W, literal.x, 1.0, KC0[3].Y, -; R600-NEXT: CNDE * T2.W, PV.X, 0.0, PV.Y, BS:VEC_021/SCL_122 +; R600-NEXT: ADD T4.Y, T0.Z, PV.W, +; R600-NEXT: BFI_INT * T0.W, literal.x, PS, KC0[3].Y, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: ADD T0.Y, T0.Y, PS, -; R600-NEXT: CNDE * T1.W, PV.Y, 0.0, PV.W, -; R600-NEXT: ADD T0.X, T1.Y, PV.W, -; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: ADD T4.X, T3.W, PV.W, +; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 store <4 x float> %result, ptr addrspace(1) %out @@ -410,67 +413,67 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # ; GFX6-LABEL: round_v8f32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; GFX6-NEXT: s_brev_b32 s12, -2 +; GFX6-NEXT: s_brev_b32 s14, -2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_trunc_f32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v0, s7 -; GFX6-NEXT: v_sub_f32_e32 v2, s7, v1 -; GFX6-NEXT: v_bfi_b32 v0, s12, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_add_f32_e32 v3, v1, v0 -; GFX6-NEXT: v_trunc_f32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_sub_f32_e32 v2, s6, v1 -; GFX6-NEXT: v_bfi_b32 v0, s12, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_add_f32_e32 v2, v1, v0 -; GFX6-NEXT: v_trunc_f32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s5 -; GFX6-NEXT: v_sub_f32_e32 v4, s5, v1 -; GFX6-NEXT: v_bfi_b32 v0, s12, 1.0, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_trunc_f32_e32 v4, s4 -; GFX6-NEXT: v_add_f32_e32 v1, v1, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_sub_f32_e32 v5, s4, v4 -; GFX6-NEXT: v_bfi_b32 v0, s12, 1.0, v0 -; GFX6-NEXT: 
v_cmp_ge_f32_e64 vcc, |v5|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_trunc_f32_e32 v5, s11 -; GFX6-NEXT: v_add_f32_e32 v0, v4, v0 -; GFX6-NEXT: v_mov_b32_e32 v4, s11 -; GFX6-NEXT: v_sub_f32_e32 v6, s11, v5 -; GFX6-NEXT: v_bfi_b32 v4, s12, 1.0, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX6-NEXT: v_add_f32_e32 v7, v5, v4 -; GFX6-NEXT: v_trunc_f32_e32 v5, s10 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NEXT: v_sub_f32_e32 v6, s10, v5 -; GFX6-NEXT: v_bfi_b32 v4, s12, 1.0, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX6-NEXT: v_add_f32_e32 v6, v5, v4 -; GFX6-NEXT: v_trunc_f32_e32 v5, s9 -; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: v_sub_f32_e32 v8, s9, v5 -; GFX6-NEXT: v_bfi_b32 v4, s12, 1.0, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v8|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX6-NEXT: v_trunc_f32_e32 v8, s8 -; GFX6-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s8 -; GFX6-NEXT: v_sub_f32_e32 v9, s8, v8 -; GFX6-NEXT: v_bfi_b32 v4, s12, 1.0, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v9|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX6-NEXT: v_add_f32_e32 v4, v8, v4 +; GFX6-NEXT: v_trunc_f32_e32 v0, s7 +; GFX6-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX6-NEXT: v_trunc_f32_e32 v0, s6 +; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX6-NEXT: v_trunc_f32_e32 v0, s5 +; GFX6-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v4, s5 +; GFX6-NEXT: v_bfi_b32 v1, s14, v1, v4 +; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX6-NEXT: v_trunc_f32_e32 v0, s4 +; GFX6-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NEXT: v_bfi_b32 v4, s14, v4, v5 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX6-NEXT: v_trunc_f32_e32 v4, s11 +; GFX6-NEXT: v_sub_f32_e32 v5, s11, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v6, s11 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX6-NEXT: v_add_f32_e32 v7, v4, v5 +; GFX6-NEXT: v_trunc_f32_e32 v4, s10 +; GFX6-NEXT: v_sub_f32_e32 v5, s10, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v6, s10 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX6-NEXT: v_add_f32_e32 v6, v4, v5 +; GFX6-NEXT: v_trunc_f32_e32 v4, s9 +; GFX6-NEXT: v_sub_f32_e32 v5, s9, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v8, s9 +; GFX6-NEXT: v_bfi_b32 v5, s14, v5, v8 +; GFX6-NEXT: v_add_f32_e32 v5, v4, v5 +; GFX6-NEXT: v_trunc_f32_e32 v4, s8 +; GFX6-NEXT: v_sub_f32_e32 v8, s8, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v9, s8 +; GFX6-NEXT: v_bfi_b32 v8, s14, v8, v9 
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v8 ; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -478,67 +481,67 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # ; GFX89-LABEL: round_v8f32: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GFX89-NEXT: s_brev_b32 s12, -2 +; GFX89-NEXT: s_brev_b32 s14, -2 ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b32 s3, 0xf000 ; GFX89-NEXT: s_mov_b32 s2, -1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v1, s7 -; GFX89-NEXT: v_mov_b32_e32 v0, s7 -; GFX89-NEXT: v_sub_f32_e32 v2, s7, v1 -; GFX89-NEXT: v_bfi_b32 v0, s12, 1.0, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX89-NEXT: v_add_f32_e32 v3, v1, v0 -; GFX89-NEXT: v_trunc_f32_e32 v1, s6 -; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_sub_f32_e32 v2, s6, v1 -; GFX89-NEXT: v_bfi_b32 v0, s12, 1.0, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX89-NEXT: v_add_f32_e32 v2, v1, v0 -; GFX89-NEXT: v_trunc_f32_e32 v1, s5 -; GFX89-NEXT: v_mov_b32_e32 v0, s5 -; GFX89-NEXT: v_sub_f32_e32 v4, s5, v1 -; GFX89-NEXT: v_bfi_b32 v0, s12, 1.0, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX89-NEXT: v_trunc_f32_e32 v4, s4 -; GFX89-NEXT: v_add_f32_e32 v1, v1, v0 -; GFX89-NEXT: v_mov_b32_e32 v0, s4 -; GFX89-NEXT: v_sub_f32_e32 v5, s4, v4 -; GFX89-NEXT: v_bfi_b32 v0, s12, 1.0, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX89-NEXT: v_trunc_f32_e32 v5, s11 -; GFX89-NEXT: v_add_f32_e32 v0, v4, v0 -; GFX89-NEXT: v_mov_b32_e32 v4, s11 -; GFX89-NEXT: v_sub_f32_e32 v6, s11, v5 -; GFX89-NEXT: v_bfi_b32 v4, s12, 1.0, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX89-NEXT: v_add_f32_e32 v7, v5, v4 -; GFX89-NEXT: v_trunc_f32_e32 v5, s10 -; GFX89-NEXT: v_mov_b32_e32 v4, s10 -; GFX89-NEXT: v_sub_f32_e32 v6, s10, v5 -; GFX89-NEXT: v_bfi_b32 v4, s12, 1.0, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX89-NEXT: v_add_f32_e32 v6, v5, v4 -; GFX89-NEXT: v_trunc_f32_e32 v5, s9 -; GFX89-NEXT: v_mov_b32_e32 v4, s9 -; GFX89-NEXT: v_sub_f32_e32 v8, s9, v5 -; GFX89-NEXT: v_bfi_b32 v4, s12, 1.0, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v8|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX89-NEXT: v_trunc_f32_e32 v8, s8 -; GFX89-NEXT: v_add_f32_e32 v5, v5, v4 -; GFX89-NEXT: v_mov_b32_e32 v4, s8 -; GFX89-NEXT: v_sub_f32_e32 v9, s8, v8 -; GFX89-NEXT: v_bfi_b32 v4, s12, 1.0, v4 -; GFX89-NEXT: v_cmp_ge_f32_e64 vcc, |v9|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX89-NEXT: v_add_f32_e32 v4, v8, v4 +; GFX89-NEXT: v_trunc_f32_e32 v0, s7 +; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX89-NEXT: v_mov_b32_e32 v2, s7 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v3, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s6 +; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13] +; GFX89-NEXT: v_mov_b32_e32 v2, s6 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2 +; GFX89-NEXT: v_add_f32_e32 v2, v0, v1 +; GFX89-NEXT: 
v_trunc_f32_e32 v0, s5 +; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7] +; GFX89-NEXT: v_mov_b32_e32 v4, s5 +; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v4 +; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 +; GFX89-NEXT: v_trunc_f32_e32 v0, s4 +; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7] +; GFX89-NEXT: v_mov_b32_e32 v5, s4 +; GFX89-NEXT: v_bfi_b32 v4, s14, v4, v5 +; GFX89-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX89-NEXT: v_trunc_f32_e32 v4, s11 +; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX89-NEXT: v_mov_b32_e32 v6, s11 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX89-NEXT: v_add_f32_e32 v7, v4, v5 +; GFX89-NEXT: v_trunc_f32_e32 v4, s10 +; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX89-NEXT: v_mov_b32_e32 v6, s10 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6 +; GFX89-NEXT: v_add_f32_e32 v6, v4, v5 +; GFX89-NEXT: v_trunc_f32_e32 v4, s9 +; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] +; GFX89-NEXT: v_mov_b32_e32 v8, s9 +; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v8 +; GFX89-NEXT: v_add_f32_e32 v5, v4, v5 +; GFX89-NEXT: v_trunc_f32_e32 v4, s8 +; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] +; GFX89-NEXT: v_mov_b32_e32 v9, s8 +; GFX89-NEXT: v_bfi_b32 v8, s14, v8, v9 +; GFX89-NEXT: v_add_f32_e32 v4, v4, v8 ; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX89-NEXT: s_endpgm @@ -549,55 +552,66 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v1, s7 -; GFX11-NEXT: v_trunc_f32_e32 v4, s6 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, 1.0, s7 -; GFX11-NEXT: v_trunc_f32_e32 v5, s5 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, 1.0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_sub_f32 v16, s7, v1 :: v_dual_sub_f32 v17, s6, v4 +; GFX11-NEXT: v_trunc_f32_e32 v0, s7 +; GFX11-NEXT: v_trunc_f32_e32 v1, s6 +; GFX11-NEXT: v_trunc_f32_e32 v4, s5 ; GFX11-NEXT: v_trunc_f32_e32 v8, s4 -; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, 1.0, s11 -; GFX11-NEXT: v_trunc_f32_e32 v9, s11 -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v16|, 0.5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_sub_f32 v18, s5, v5 :: v_dual_sub_f32 v19, s4, v8 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, 1.0, s5 -; GFX11-NEXT: v_trunc_f32_e32 v11, s10 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v17|, 0.5 -; GFX11-NEXT: v_sub_f32_e32 v20, s11, v9 -; GFX11-NEXT: v_trunc_f32_e32 v13, s9 -; GFX11-NEXT: v_sub_f32_e32 v21, s10, v11 -; GFX11-NEXT: v_bfi_b32 v10, 0x7fffffff, 1.0, s10 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, 0.5 -; GFX11-NEXT: v_trunc_f32_e32 v15, s8 -; GFX11-NEXT: v_sub_f32_e32 v22, s9, 
v13 -; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, 1.0, s9 -; GFX11-NEXT: v_bfi_b32 v14, 0x7fffffff, 1.0, s8 -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0, v3, vcc_lo -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, 0.5 -; GFX11-NEXT: v_sub_f32_e32 v23, s8, v15 -; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, 1.0, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc_lo -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, 0.5 -; GFX11-NEXT: v_dual_add_f32 v3, v1, v0 :: v_dual_add_f32 v2, v4, v2 -; GFX11-NEXT: v_add_f32_e32 v1, v5, v16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v7, v9, v7 :: v_dual_cndmask_b32 v0, 0, v10 -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v12, vcc_lo -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, 0.5 -; GFX11-NEXT: v_dual_add_f32 v5, v13, v4 :: v_dual_cndmask_b32 v10, 0, v14 -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, 0.5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v4, v15, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v6, vcc_lo -; GFX11-NEXT: v_add_f32_e32 v6, v11, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v8, v12 +; GFX11-NEXT: v_trunc_f32_e32 v5, s11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1 +; GFX11-NEXT: v_sub_f32_e32 v7, s5, v4 +; GFX11-NEXT: v_trunc_f32_e32 v9, s9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_f32_e32 v12, s11, v5 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5 +; GFX11-NEXT: v_sub_f32_e32 v11, s4, v8 +; GFX11-NEXT: v_trunc_f32_e32 v6, s10 +; GFX11-NEXT: v_sub_f32_e32 v14, s9, v9 +; GFX11-NEXT: v_trunc_f32_e32 v10, s8 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v11|, 0.5 +; GFX11-NEXT: v_sub_f32_e32 v13, s10, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v16 +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v12|, 0.5 +; GFX11-NEXT: v_add_f32_e32 v1, v4, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v13|, 0.5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s11 +; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s2 +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v14|, 0.5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v7, v5, v12 +; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s10 +; GFX11-NEXT: v_sub_f32_e32 v15, s8, v10 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s2 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v15|, 0.5 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s2 +; GFX11-NEXT: v_dual_add_f32 v5, v9, v0 :: v_dual_add_f32 v0, v8, v11 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s8 +; GFX11-NEXT: v_add_f32_e32 v4, v10, v4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 @@ -607,68 +621,58 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) # ; ; R600-LABEL: round_v8f32: ; R600: ; %bb.0: -; R600-NEXT: ALU 60, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 50, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0 -; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T1.X, 1 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: ALU clause starting at 4: ; R600-NEXT: TRUNC * T0.W, KC0[6].X, -; R600-NEXT: BFI_INT T0.X, literal.x, 1.0, KC0[5].Z, -; R600-NEXT: BFI_INT T0.Y, literal.x, 1.0, KC0[4].Y, -; R600-NEXT: TRUNC T0.Z, KC0[4].Y, -; R600-NEXT: TRUNC * T1.W, KC0[5].Z, -; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: ADD * T2.W, KC0[6].X, -T0.W, -; R600-NEXT: TRUNC T1.Y, KC0[5].X, -; R600-NEXT: SETGE T1.Z, |PV.W|, 0.5, -; R600-NEXT: BFI_INT * T2.W, literal.x, 1.0, KC0[6].X, -; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: ADD * T3.W, KC0[5].Z, -T1.W, -; R600-NEXT: SETGE T1.X, |PV.W|, 0.5, -; R600-NEXT: CNDE T2.Y, T1.Z, 0.0, T2.W, -; R600-NEXT: ADD T1.Z, KC0[5].X, -T1.Y, +; R600-NEXT: ADD T0.Z, KC0[6].X, -PV.W, +; R600-NEXT: TRUNC * T1.W, KC0[5].X, ; R600-NEXT: TRUNC * T2.W, KC0[4].W, -; R600-NEXT: ADD * T3.W, KC0[4].Y, -T0.Z, -; R600-NEXT: SETGE T2.X, |PV.W|, 0.5, -; R600-NEXT: ADD T3.Y, KC0[4].W, -T2.W, -; R600-NEXT: TRUNC T2.Z, KC0[4].Z, -; R600-NEXT: BFI_INT T3.W, literal.x, 1.0, KC0[5].X, -; R600-NEXT: SETGE * T4.W, |T1.Z|, 0.5, +; R600-NEXT: ADD T1.Z, KC0[4].W, -PV.W, +; R600-NEXT: ADD T3.W, KC0[5].X, -T1.W, +; R600-NEXT: SETGE * T4.W, |T0.Z|, 0.5, +; R600-NEXT: BFI_INT T0.Y, literal.x, PS, KC0[6].X, +; R600-NEXT: SETGE T0.Z, |PV.W|, 0.5, +; R600-NEXT: SETGE T3.W, |PV.Z|, 0.5, +; R600-NEXT: TRUNC * T4.W, KC0[5].Y, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: CNDE T3.X, PS, 0.0, PV.W, -; R600-NEXT: ADD T4.Y, KC0[4].Z, -PV.Z, -; R600-NEXT: SETGE T1.Z, |PV.Y|, 0.5, -; R600-NEXT: BFI_INT T3.W, literal.x, 1.0, KC0[4].W, -; R600-NEXT: TRUNC * T4.W, KC0[5].W, +; R600-NEXT: ADD T1.Y, KC0[5].Y, -PS, +; R600-NEXT: BFI_INT T1.Z, literal.x, PV.W, KC0[4].W, +; R600-NEXT: BFI_INT T3.W, literal.x, PV.Z, KC0[5].X, +; R600-NEXT: TRUNC * T5.W, KC0[4].Z, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: ADD T4.X, KC0[5].W, -PS, -; R600-NEXT: CNDE T3.Y, PV.Z, 0.0, PV.W, -; R600-NEXT: SETGE T1.Z, |PV.Y|, 0.5, -; R600-NEXT: BFI_INT T3.W, literal.x, 1.0, KC0[4].Z, -; R600-NEXT: ADD * T5.W, T1.Y, PV.X, +; R600-NEXT: TRUNC T0.Z, KC0[4].Y, +; R600-NEXT: TRUNC * T6.W, KC0[5].W, +; R600-NEXT: ADD * T7.W, KC0[4].Z, -T5.W, +; R600-NEXT: TRUNC T0.X, KC0[5].Z, +; 
R600-NEXT: SETGE T2.Y, |PV.W|, 0.5, +; R600-NEXT: ADD T2.Z, KC0[5].W, -T6.W, BS:VEC_102/SCL_221 +; R600-NEXT: ADD T7.W, KC0[4].Y, -T0.Z, +; R600-NEXT: ADD * T3.W, T1.W, T3.W, +; R600-NEXT: SETGE T1.X, |PV.W|, 0.5, +; R600-NEXT: SETGE T4.Y, |PV.Z|, 0.5, +; R600-NEXT: ADD T3.Z, T2.W, T1.Z, +; R600-NEXT: BFI_INT T1.W, literal.x, PV.Y, KC0[4].Z, +; R600-NEXT: ADD * T2.W, KC0[5].Z, -PV.X, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: TRUNC T3.X, KC0[5].Y, -; R600-NEXT: CNDE T1.Y, PV.Z, 0.0, PV.W, -; R600-NEXT: ADD T5.Z, T2.W, PV.Y, -; R600-NEXT: BFI_INT T2.W, literal.x, 1.0, KC0[5].W, -; R600-NEXT: SETGE * T3.W, |PV.X|, 0.5, +; R600-NEXT: SETGE T2.X, |PS|, 0.5, +; R600-NEXT: ADD T3.Y, T5.W, PV.W, +; R600-NEXT: BFI_INT T1.Z, literal.x, PV.Y, KC0[5].W, +; R600-NEXT: BFI_INT T1.W, literal.x, PV.X, KC0[4].Y, +; R600-NEXT: ADD * T0.W, T0.W, T0.Y, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: CNDE T4.X, PS, 0.0, PV.W, -; R600-NEXT: ADD T5.Y, T2.Z, PV.Y, -; R600-NEXT: ADD T1.Z, KC0[5].Y, -PV.X, -; R600-NEXT: CNDE T2.W, T2.X, 0.0, T0.Y, -; R600-NEXT: ADD * T0.W, T0.W, T2.Y, -; R600-NEXT: ADD T5.X, T0.Z, PV.W, -; R600-NEXT: SETGE T1.Y, |PV.Z|, 0.5, -; R600-NEXT: ADD T0.Z, T4.W, PV.X, -; R600-NEXT: BFI_INT T2.W, literal.x, 1.0, KC0[5].Y, -; R600-NEXT: CNDE * T3.W, T1.X, 0.0, T0.X, BS:VEC_021/SCL_122 +; R600-NEXT: ADD T3.X, T0.Z, PV.W, +; R600-NEXT: ADD T0.Z, T6.W, PV.Z, +; R600-NEXT: BFI_INT T1.W, literal.x, PV.X, KC0[5].Z, +; R600-NEXT: SETGE * T2.W, |T1.Y|, 0.5, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) ; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x, -; R600-NEXT: ADD T0.Y, T1.W, PS, -; R600-NEXT: CNDE * T1.W, PV.Y, 0.0, PV.W, -; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; R600-NEXT: ADD T0.X, T3.X, PV.W, +; R600-NEXT: ADD T0.Y, T0.X, PV.W, +; R600-NEXT: BFI_INT * T1.W, literal.y, PS, KC0[5].Y, +; R600-NEXT: 2(2.802597e-45), 2147483647(nan) +; R600-NEXT: ADD T0.X, T4.W, PV.W, ; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, ; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; R600-NEXT: LSHR * T2.X, PV.W, literal.x, @@ -683,55 +687,55 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX6-NEXT: v_trunc_f32_e32 v1, v0 +; GFX6-NEXT: v_sub_f32_e32 v2, v0, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[2:3] ; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: v_trunc_f32_e32 v2, v0 -; GFX6-NEXT: v_bfi_b32 v1, s2, 1.0, v0 -; GFX6-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX6-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX6-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: round_f16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c -; GFX8-NEXT: s_movk_i32 s1, 0x7fff +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_movk_i32 s5, 0x7fff +; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1 -; GFX8-NEXT: v_trunc_f16_e32 v1, s0 -; GFX8-NEXT: v_sub_f16_e32 v2, s0, v1 +; GFX8-NEXT: v_trunc_f16_e32 v1, s4 +; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1 ; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_add_f16_e32 v0, v1, v0 -; GFX8-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: round_f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1 ; GFX9-NEXT: v_trunc_f16_e32 v1, s2 ; GFX9-NEXT: v_sub_f16_e32 v2, s2, v1 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_add_f16_e32 v0, v1, v0 ; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -741,18 +745,16 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_movk_i32 s3, 0x3c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_trunc_f16_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v2, s2, v0 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0 +; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f16_e64 vcc_lo, |v2|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -762,7 +764,7 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; ; R600-LABEL: round_f16: ; R600: ; %bb.0: -; R600-NEXT: ALU 19, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT MSKOR T0.XW, T1.X ; R600-NEXT: CF_END ; R600-NEXT: PAD @@ -770,12 +772,10 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; R600-NEXT: FLT16_TO_FLT32 * T0.W, KC0[2].Z, ; R600-NEXT: TRUNC * T1.W, PV.W, ; R600-NEXT: ADD * T2.W, T0.W, -PV.W, -; R600-NEXT: BFI_INT T0.W, literal.x, 1.0, T0.W, ; R600-NEXT: SETGE * T2.W, |PV.W|, 0.5, -; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: CNDE T0.W, PS, 0.0, PV.W, -; R600-NEXT: AND_INT * T2.W, KC0[2].Y, literal.x, -; R600-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; R600-NEXT: 
BFI_INT T0.W, literal.x, PV.W, T0.W, +; R600-NEXT: AND_INT * T2.W, KC0[2].Y, literal.y, +; R600-NEXT: 2147483647(nan), 3(4.203895e-45) ; R600-NEXT: ADD * T0.W, T1.W, PV.W, ; R600-NEXT: FLT32_TO_FLT16 T0.W, PV.W, ; R600-NEXT: LSHL * T1.W, T2.W, literal.x, @@ -799,29 +799,29 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX6-LABEL: round_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s2 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: v_trunc_f32_e32 v3, v0 -; GFX6-NEXT: v_bfi_b32 v2, s2, 1.0, v0 -; GFX6-NEXT: v_trunc_f32_e32 v4, v1 -; GFX6-NEXT: v_bfi_b32 v5, s2, 1.0, v1 -; GFX6-NEXT: v_sub_f32_e32 v1, v1, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, 0.5 -; GFX6-NEXT: v_sub_f32_e32 v0, v0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5 -; GFX6-NEXT: v_add_f32_e32 v1, v4, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX6-NEXT: v_trunc_f32_e32 v3, v1 +; GFX6-NEXT: v_sub_f32_e32 v5, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 v2, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, 0.5 +; GFX6-NEXT: v_sub_f32_e32 v4, v0, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[2:3] +; GFX6-NEXT: v_bfi_b32 v1, s4, v5, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5 +; GFX6-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[2:3] +; GFX6-NEXT: v_bfi_b32 v0, s4, v3, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v2, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -830,55 +830,55 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: v_mov_b32_e32 v2, 0x3c00 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX8-NEXT: s_movk_i32 s6, 0x7fff ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s6, s4, 16 -; GFX8-NEXT: v_trunc_f16_e32 v0, s6 -; GFX8-NEXT: v_sub_f16_e32 v1, s6, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_bfi_b32 v3, s5, v2, v3 -; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v1|, 0.5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_bfi_b32 v1, s5, v2, v1 +; GFX8-NEXT: s_lshr_b32 s5, s4, 16 +; GFX8-NEXT: v_trunc_f16_e32 v1, s5 +; GFX8-NEXT: v_sub_f16_e32 v2, s5, v1 +; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_bfi_b32 v2, s6, v2, v3 +; GFX8-NEXT: v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_trunc_f16_e32 v2, s4 ; GFX8-NEXT: v_sub_f16_e32 v3, s4, v2 ; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 -; GFX8-NEXT: 
v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX8-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_bfi_b32 v0, s6, v0, v3 +; GFX8-NEXT: v_add_f16_e32 v0, v2, v0 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: round_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_movk_i32 s0, 0x7fff +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 +; GFX9-NEXT: s_movk_i32 s1, 0x7fff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: v_trunc_f16_e32 v1, s0 +; GFX9-NEXT: v_sub_f16_e32 v2, s0, v1 +; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_bfi_b32 v2, s1, v2, v3 +; GFX9-NEXT: v_add_f16_e32 v1, v1, v2 ; GFX9-NEXT: v_trunc_f16_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_sub_f16_e32 v3, s2, v2 -; GFX9-NEXT: v_bfi_b32 v1, s0, v0, v1 -; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: v_add_f16_e32 v1, v2, v1 -; GFX9-NEXT: v_trunc_f16_e32 v2, s1 -; GFX9-NEXT: v_sub_f16_e32 v3, s1, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v4 ; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v2, v0 -; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -887,28 +887,29 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: s_movk_i32 s4, 0x3c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f16_e32 v0, s2 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: v_trunc_f16_e32 v2, s3 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: v_sub_f16_e32 v4, s2, v0 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s4, v1 -; GFX11-NEXT: v_sub_f16_e32 v5, s3, v2 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s4, v3 -; GFX11-NEXT: v_cmp_ge_f16_e64 vcc_lo, |v4|, 0.5 +; GFX11-NEXT: v_trunc_f16_e32 v1, s2 +; GFX11-NEXT: v_trunc_f16_e32 v0, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1 +; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX11-NEXT: 
v_cmp_ge_f16_e64 vcc_lo, |v5|, 0.5 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v1, v2, v3 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -916,7 +917,7 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; ; R600-LABEL: round_v2f16: ; R600: ; %bb.0: -; R600-NEXT: ALU 24, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: ALU 22, @4, KC0[CB0:0-32], KC1[] ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 ; R600-NEXT: CF_END ; R600-NEXT: PAD @@ -928,20 +929,18 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; R600-NEXT: TRUNC * T2.W, PV.W, ; R600-NEXT: ADD T3.W, T0.W, -PS, ; R600-NEXT: TRUNC * T4.W, PV.W, -; R600-NEXT: ADD T0.Z, T1.W, -PS, -; R600-NEXT: BFI_INT T0.W, literal.x, 1.0, T0.W, +; R600-NEXT: ADD T5.W, T1.W, -PS, +; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5, +; R600-NEXT: BFI_INT T0.W, literal.x, PS, T0.W, ; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: CNDE T1.Z, PS, 0.0, PV.W, -; R600-NEXT: BFI_INT T0.W, literal.x, 1.0, T1.W, -; R600-NEXT: SETGE * T1.W, |PV.Z|, 0.5, +; R600-NEXT: BFI_INT T1.W, literal.x, PS, T1.W, BS:VEC_021/SCL_122 +; R600-NEXT: ADD * T0.W, T2.W, PV.W, ; R600-NEXT: 2147483647(nan), 0(0.000000e+00) -; R600-NEXT: CNDE T0.W, PS, 0.0, PV.W, -; R600-NEXT: ADD * T1.W, T2.W, PV.Z, -; R600-NEXT: FLT32_TO_FLT16 T1.W, PS, -; R600-NEXT: ADD * T0.W, T4.W, PV.W, ; R600-NEXT: FLT32_TO_FLT16 T0.W, PS, -; R600-NEXT: LSHL * T1.W, PV.W, literal.x, +; R600-NEXT: ADD * T1.W, T4.W, PV.W, +; R600-NEXT: FLT32_TO_FLT16 T1.W, PS, +; R600-NEXT: LSHL * T0.W, PV.W, literal.x, ; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; R600-NEXT: OR_INT T0.X, PV.W, PS, ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 236faf8ecf2e8..438b1bfe319a0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -1282,17 +1282,17 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s2 -; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_ushort v12, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_u32 v3, 
v12, 3, 1 ; GFX6-NEXT: v_bfe_u32 v1, v12, 1, 1 @@ -1310,10 +1310,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX6-NEXT: v_bfe_u32 v8, v12, 8, 1 ; GFX6-NEXT: v_bfe_u32 v14, v12, 14, 1 ; GFX6-NEXT: v_bfe_u32 v12, v12, 12, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v16i1_to_v16i32: @@ -1442,17 +1442,17 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_sextload_v16i1_to_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s2 -; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_ushort v12, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v3, v12, 3, 1 ; GFX6-NEXT: v_bfe_i32 v2, v12, 2, 1 @@ -1470,10 +1470,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o ; GFX6-NEXT: v_bfe_i32 v14, v12, 14, 1 ; GFX6-NEXT: v_bfe_i32 v13, v12, 13, 1 ; GFX6-NEXT: v_bfe_i32 v12, v12, 12, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v16i1_to_v16i32: @@ -2428,12 +2428,12 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s3, 24 -; GFX8-NEXT: s_lshr_b32 s7, s2, 24 +; GFX8-NEXT: s_lshr_b32 s6, s3, 24 +; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10018 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x10018 -; GFX8-NEXT: s_and_b32 s10, s3, 1 -; GFX8-NEXT: s_and_b32 s11, s2, 1 +; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10018 +; GFX8-NEXT: s_and_b32 s7, s3, 1 +; GFX8-NEXT: s_and_b32 s9, s2, 1 ; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10013 ; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10012 ; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10011 @@ -2446,170 +2446,170 @@ 
define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10012 ; GFX8-NEXT: s_bfe_u32 s22, s3, 0x10011 ; GFX8-NEXT: s_bfe_u32 s23, s3, 0x10010 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x10017 -; GFX8-NEXT: s_bfe_u32 s9, s3, 0x10016 +; GFX8-NEXT: s_bfe_u32 s10, s3, 0x10017 +; GFX8-NEXT: s_bfe_u32 s11, s3, 0x10016 ; GFX8-NEXT: s_bfe_u32 s24, s3, 0x10015 ; GFX8-NEXT: s_bfe_u32 s25, s3, 0x10014 -; GFX8-NEXT: v_mov_b32_e32 v25, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v24, s9 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v25, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v24, s11 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v22, s25 ; GFX8-NEXT: v_mov_b32_e32 v23, s24 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v22, s23 ; GFX8-NEXT: v_mov_b32_e32 v23, s22 ; GFX8-NEXT: v_mov_b32_e32 v24, s21 ; GFX8-NEXT: v_mov_b32_e32 v25, s20 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v22, s19 ; GFX8-NEXT: v_mov_b32_e32 v23, s18 ; GFX8-NEXT: v_mov_b32_e32 v24, s17 ; GFX8-NEXT: v_mov_b32_e32 v25, s16 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v22, s15 ; GFX8-NEXT: v_mov_b32_e32 v23, s14 ; GFX8-NEXT: v_mov_b32_e32 v24, s13 ; GFX8-NEXT: v_mov_b32_e32 v25, s12 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3 -; GFX8-NEXT: v_mov_b32_e32 v25, s9 +; GFX8-NEXT: v_mov_b32_e32 v25, s11 ; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2 ; GFX8-NEXT: v_and_b32_e32 v21, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v27, 1, v22 ; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s3 -; GFX8-NEXT: v_mov_b32_e32 v24, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 32 +; GFX8-NEXT: v_mov_b32_e32 v24, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 32 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2 ; GFX8-NEXT: v_and_b32_e32 v28, 1, v22 ; GFX8-NEXT: v_and_b32_e32 v22, 1, v20 ; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v18 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s2 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v19 
+; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2 ; GFX8-NEXT: v_and_b32_e32 v17, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v2 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 ; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 3, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s3 ; GFX8-NEXT: v_mov_b32_e32 v25, 1 -; GFX8-NEXT: v_mov_b32_e32 v21, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v18 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX8-NEXT: v_and_b32_sdwa v16, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v20, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s5 +; GFX8-NEXT: v_mov_b32_e32 v21, s11 ; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_and_b32_e32 v20, 1, v11 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 1, s5 -; GFX8-NEXT: s_add_u32 s8, s0, 16 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX8-NEXT: v_and_b32_sdwa v16, v14, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v20, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s6 ; GFX8-NEXT: v_and_b32_e32 v15, 1, v0 +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GFX8-NEXT: v_and_b32_e32 v20, 1, v14 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6 +; GFX8-NEXT: s_add_u32 s10, s0, 16 +; GFX8-NEXT: v_and_b32_e32 v17, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 3, s6 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v11 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s5 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 4, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v15 +; GFX8-NEXT: v_mov_b32_e32 v16, s11 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v15 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v11 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v12 -; GFX8-NEXT: v_mov_b32_e32 v12, s9 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v9 -; GFX8-NEXT: v_mov_b32_e32 v11, s8 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s2 -; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[14:17] -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s7 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX8-NEXT: v_mov_b32_e32 v14, s1 -; GFX8-NEXT: s_add_u32 s8, s0, 0xb0 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX8-NEXT: v_mov_b32_e32 v15, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v0 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s3 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 -; GFX8-NEXT: v_mov_b32_e32 v13, s0 -; GFX8-NEXT: 
v_lshrrev_b16_e64 v7, 1, s7 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s8 +; GFX8-NEXT: v_mov_b32_e32 v13, s1 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v11 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GFX8-NEXT: s_add_u32 s10, s0, 0xb0 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s8 ; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 14, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s3 -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[9:12] +; GFX8-NEXT: v_and_b32_e32 v11, 1, v8 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s8 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v7 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 3, s7 -; GFX8-NEXT: v_mov_b32_e32 v10, s9 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 6, s7 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s3 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX8-NEXT: v_mov_b32_e32 v9, s8 +; GFX8-NEXT: v_mov_b32_e32 v10, s11 ; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 4, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s8 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] -; GFX8-NEXT: v_and_b32_e32 v10, 1, v17 +; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v17 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 ; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s5 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v16 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v18 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v16 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v18 ; GFX8-NEXT: v_and_b32_e32 v18, 1, v3 ; GFX8-NEXT: v_and_b32_sdwa v16, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 4, s7 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v19 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v4 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x90 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v21 ; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v1 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v20 ; GFX8-NEXT: v_and_b32_e32 v20, 1, v0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x80 ; GFX8-NEXT: v_and_b32_e32 v18, 1, 
v4 @@ -2621,30 +2621,30 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v28 ; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v20, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s6 ; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[1:4] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s6 ; GFX8-NEXT: v_and_b32_e32 v16, 1, v24 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s8 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v6, 1, v12 -; GFX8-NEXT: v_mov_b32_e32 v12, s6 +; GFX8-NEXT: v_mov_b32_e32 v12, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s0, s0, 0x60 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -2669,173 +2669,173 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T41.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T39.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T20.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T21.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T34.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T32.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T30.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T28.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T26.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T24.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T24.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T22.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_64 T20.XY, T19.X, 0, #1 +; EG-NEXT: VTX_READ_64 T21.XY, T19.X, 0, #1 ; EG-NEXT: ALU clause starting at 24: ; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 25: -; EG-NEXT: BFE_UINT * T19.W, T20.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T19.W, T21.X, literal.x, 1, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T19.Z, T20.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T19.Z, T21.X, literal.x, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T19.Y, T20.X, 1, 1, -; EG-NEXT: BFE_UINT * T21.W, T20.X, literal.x, 1, +; EG-NEXT: BFE_UINT T19.Y, T21.X, 1, 1, +; EG-NEXT: BFE_UINT * T20.W, T21.X, literal.x, 1, ; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T19.X, T20.X, 1, -; EG-NEXT: BFE_UINT T21.Z, T20.X, literal.x, 1, +; EG-NEXT: AND_INT T19.X, T21.X, 1, +; EG-NEXT: BFE_UINT T20.Z, T21.X, literal.x, 1, ; EG-NEXT: LSHR * T22.X, KC0[2].Y, literal.y, ; EG-NEXT: 6(8.407791e-45), 2(2.802597e-45) -; EG-NEXT: BFE_UINT T21.Y, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT * T23.W, T20.X, literal.y, 1, +; EG-NEXT: 
BFE_UINT T20.Y, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T23.W, T21.X, literal.y, 1, ; EG-NEXT: 5(7.006492e-45), 11(1.541428e-44) -; EG-NEXT: BFE_UINT T21.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T23.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T20.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T23.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T24.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T23.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T25.W, T20.X, literal.z, 1, +; EG-NEXT: BFE_UINT T23.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T25.W, T21.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) ; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T23.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T25.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T23.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T25.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44) ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T26.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T25.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T27.W, T20.X, literal.z, 1, +; EG-NEXT: BFE_UINT T25.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T27.W, T21.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44) ; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T25.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T27.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T25.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T27.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 12(1.681558e-44), 18(2.522337e-44) ; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T28.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T27.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T29.W, T20.X, literal.z, 1, +; EG-NEXT: BFE_UINT T27.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T29.W, T21.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 17(2.382207e-44) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T27.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T29.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T27.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T29.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 22(3.082857e-44) ; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T30.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T29.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T31.W, T20.X, literal.z, 1, +; EG-NEXT: BFE_UINT T29.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T31.W, T21.X, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44) ; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T29.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T31.Z, T20.X, literal.y, 1, +; EG-NEXT: BFE_UINT T29.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T31.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 20(2.802597e-44), 26(3.643376e-44) ; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T32.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T31.Y, T20.X, literal.y, 1, -; EG-NEXT: LSHR * T33.W, T20.X, literal.z, +; EG-NEXT: BFE_UINT T31.Y, T21.X, literal.y, 1, +; EG-NEXT: LSHR * T33.W, T21.X, literal.z, ; EG-NEXT: 2(2.802597e-45), 25(3.503246e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T31.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T33.Z, T20.X, literal.y, 1, +; EG-NEXT: 
BFE_UINT T31.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T33.Z, T21.X, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 24(3.363116e-44), 30(4.203895e-44) ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T34.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T33.Y, T20.X, literal.y, 1, -; EG-NEXT: BFE_UINT * T35.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T33.Y, T21.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T35.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44) ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T33.X, T20.X, literal.x, 1, -; EG-NEXT: BFE_UINT T35.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T33.X, T21.X, literal.x, 1, +; EG-NEXT: BFE_UINT T35.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 28(3.923636e-44), 2(2.802597e-45) ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) -; EG-NEXT: LSHR T20.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T35.Y, T20.Y, 1, 1, -; EG-NEXT: BFE_UINT T36.W, T20.Y, literal.y, 1, -; EG-NEXT: AND_INT * T35.X, T20.Y, 1, +; EG-NEXT: LSHR T21.X, PV.W, literal.x, +; EG-NEXT: BFE_UINT T35.Y, T21.Y, 1, 1, +; EG-NEXT: BFE_UINT T36.W, T21.Y, literal.y, 1, +; EG-NEXT: AND_INT * T35.X, T21.Y, 1, ; EG-NEXT: 2(2.802597e-45), 7(9.809089e-45) -; EG-NEXT: BFE_UINT T36.Z, T20.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T36.Z, T21.Y, literal.x, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 6(8.407791e-45), 128(1.793662e-43) ; EG-NEXT: LSHR T37.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T36.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T38.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T36.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T38.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 5(7.006492e-45) ; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T36.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T38.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T36.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T38.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 4(5.605194e-45), 10(1.401298e-44) ; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 122: ; EG-NEXT: LSHR T39.X, T0.W, literal.x, -; EG-NEXT: BFE_UINT T38.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T40.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T38.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T40.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) ; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T38.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T40.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T38.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T40.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 8(1.121039e-44), 14(1.961818e-44) ; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T41.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T40.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T42.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T40.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T42.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 13(1.821688e-44) ; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T40.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T42.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T40.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T42.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 12(1.681558e-44), 18(2.522337e-44) ; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T43.X, PV.W, literal.x, -; EG-NEXT: 
BFE_UINT T42.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T44.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T42.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T44.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 17(2.382207e-44) ; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T42.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T44.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T42.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T44.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 22(3.082857e-44) ; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T45.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T44.Y, T20.Y, literal.y, 1, -; EG-NEXT: BFE_UINT * T46.W, T20.Y, literal.z, 1, +; EG-NEXT: BFE_UINT T44.Y, T21.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T46.W, T21.Y, literal.z, 1, ; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44) ; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T44.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T46.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T44.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T46.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 20(2.802597e-44), 26(3.643376e-44) ; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T47.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T46.Y, T20.Y, literal.y, 1, -; EG-NEXT: LSHR * T48.W, T20.Y, literal.z, +; EG-NEXT: BFE_UINT T46.Y, T21.Y, literal.y, 1, +; EG-NEXT: LSHR * T48.W, T21.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 25(3.503246e-44) ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T46.X, T20.Y, literal.x, 1, -; EG-NEXT: BFE_UINT T48.Z, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T46.X, T21.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T48.Z, T21.Y, literal.y, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; EG-NEXT: 24(3.363116e-44), 30(4.203895e-44) ; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T49.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT * T48.Y, T20.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T48.Y, T21.Y, literal.y, 1, ; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44) -; EG-NEXT: BFE_UINT T48.X, T20.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T48.X, T21.Y, literal.x, 1, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 28(3.923636e-44), 240(3.363116e-43) ; EG-NEXT: LSHR * T50.X, PV.W, literal.x, @@ -3031,15 +3031,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 1, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s2 -; GFX8-NEXT: s_lshr_b32 s6, s3, 24 -; GFX8-NEXT: s_lshr_b32 s7, s2, 24 +; GFX8-NEXT: s_lshr_b32 s7, s3, 24 +; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018 ; GFX8-NEXT: s_bfe_i32 s5, s3, 0x10018 -; GFX8-NEXT: s_bfe_i32 s10, s3, 0x10000 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x10000 +; GFX8-NEXT: s_bfe_i32 s6, s3, 0x10000 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10000 ; GFX8-NEXT: s_bfe_i32 s12, s2, 0x10013 ; GFX8-NEXT: s_bfe_i32 s13, s2, 0x10012 ; GFX8-NEXT: s_bfe_i32 s14, s2, 0x10011 @@ -3052,59 +3052,59 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i32 s20, s3, 0x10012 ; GFX8-NEXT: s_bfe_i32 s21, s3, 0x10011 ; GFX8-NEXT: s_bfe_i32 s22, s3, 0x10010 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x10017 -; GFX8-NEXT: s_bfe_i32 s9, s3, 
0x10016 +; GFX8-NEXT: s_bfe_i32 s10, s3, 0x10017 +; GFX8-NEXT: s_bfe_i32 s11, s3, 0x10016 ; GFX8-NEXT: s_bfe_i32 s23, s3, 0x10015 ; GFX8-NEXT: s_bfe_i32 s24, s3, 0x10014 -; GFX8-NEXT: v_mov_b32_e32 v25, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0xd0 -; GFX8-NEXT: v_mov_b32_e32 v24, s9 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v25, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v24, s11 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v22, s24 ; GFX8-NEXT: v_mov_b32_e32 v23, s23 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v22, s22 ; GFX8-NEXT: v_mov_b32_e32 v23, s21 ; GFX8-NEXT: v_mov_b32_e32 v24, s20 ; GFX8-NEXT: v_mov_b32_e32 v25, s19 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v22, s2 ; GFX8-NEXT: v_mov_b32_e32 v23, s18 ; GFX8-NEXT: v_mov_b32_e32 v24, s17 ; GFX8-NEXT: v_mov_b32_e32 v25, s16 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] -; GFX8-NEXT: v_mov_b32_e32 v27, s9 -; GFX8-NEXT: v_mov_b32_e32 v26, s8 -; GFX8-NEXT: s_add_u32 s8, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v27, s11 +; GFX8-NEXT: v_mov_b32_e32 v26, s10 +; GFX8-NEXT: s_add_u32 s10, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v22, s15 ; GFX8-NEXT: v_mov_b32_e32 v23, s14 ; GFX8-NEXT: v_mov_b32_e32 v24, s13 ; GFX8-NEXT: v_mov_b32_e32 v25, s12 -; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25] ; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v23, s9 +; GFX8-NEXT: v_mov_b32_e32 v23, s11 ; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 ; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1 ; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v22, s8 +; GFX8-NEXT: v_mov_b32_e32 v22, s10 ; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 15, s3 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s3 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s3 @@ -3135,28 +3135,29 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 -; GFX8-NEXT: v_mov_b32_e32 v13, s1 -; GFX8-NEXT: v_bfe_i32 v11, v9, 0, 1 -; GFX8-NEXT: v_bfe_i32 v10, v8, 0, 1 -; GFX8-NEXT: v_bfe_i32 v9, v6, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: v_mov_b32_e32 v12, s0 +; GFX8-NEXT: v_mov_b32_e32 v12, s1 +; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1 +; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 
1 +; GFX8-NEXT: v_bfe_i32 v8, v7, 0, 1 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: v_mov_b32_e32 v11, s0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GFX8-NEXT: v_bfe_i32 v6, v7, 0, 1 +; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s8 ; GFX8-NEXT: v_mov_b32_e32 v8, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 5, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 5, s8 +; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 ; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[3:6] ; GFX8-NEXT: v_bfe_i32 v8, v11, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 2, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 2, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s8 ; GFX8-NEXT: v_bfe_i32 v7, v10, 0, 1 ; GFX8-NEXT: v_bfe_i32 v11, v1, 0, 1 ; GFX8-NEXT: v_bfe_i32 v10, v0, 0, 1 @@ -3165,17 +3166,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v12, 0, 1 +; GFX8-NEXT: v_bfe_i32 v6, v13, 0, 1 ; GFX8-NEXT: v_bfe_i32 v13, v24, 0, 1 ; GFX8-NEXT: v_bfe_i32 v12, v2, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x90 -; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v25, 4, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_bfe_i32 v6, v25, 0, 1 ; GFX8-NEXT: v_bfe_i32 v12, v15, 0, 1 ; GFX8-NEXT: v_bfe_i32 v15, v19, 0, 1 ; GFX8-NEXT: v_bfe_i32 v19, v23, 0, 1 @@ -3185,8 +3185,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v22, v26, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x80 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[22:25] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -3194,11 +3194,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v14, v18, 0, 1 ; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1 ; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v18, s10 +; GFX8-NEXT: v_mov_b32_e32 v18, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -3206,7 +3206,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s7 ; GFX8-NEXT: flat_store_dwordx4 
v[0:1], v[14:17] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -3214,7 +3214,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v10, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 7, s7 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 7, s8 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4202,14 +4202,14 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 { ; GFX6-LABEL: constant_zextload_v8i1_to_v8i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s8, s2 -; GFX6-NEXT: s_mov_b32 s9, s3 +; GFX6-NEXT: s_mov_b32 s8, s6 +; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 @@ -4219,8 +4219,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX6-NEXT: v_mov_b32_e32 v11, v1 ; GFX6-NEXT: v_mov_b32_e32 v13, v1 ; GFX6-NEXT: v_mov_b32_e32 v15, v1 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_u32 v14, v0, 1, 1 ; GFX6-NEXT: v_bfe_u32 v10, v0, 3, 1 @@ -4230,10 +4230,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out ; GFX6-NEXT: v_bfe_u32 v8, v0, 2, 1 ; GFX6-NEXT: v_bfe_u32 v4, v0, 4, 1 ; GFX6-NEXT: v_bfe_u32 v0, v0, 6, 1 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:32 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v8i1_to_v8i64: @@ -4558,100 +4558,100 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v23, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, 0 +; GFX8-NEXT: v_mov_b32_e32 v21, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, v17 +; GFX8-NEXT: v_mov_b32_e32 v13, v17 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: s_add_u32 s4, s0, 0x50 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NEXT: 
v_mov_b32_e32 v25, s3 -; GFX8-NEXT: v_mov_b32_e32 v8, s4 -; GFX8-NEXT: v_mov_b32_e32 v24, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v23, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, v2 -; GFX8-NEXT: v_mov_b32_e32 v17, v2 -; GFX8-NEXT: v_mov_b32_e32 v13, v2 -; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v17 +; GFX8-NEXT: v_mov_b32_e32 v5, v17 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 10, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[3:6] -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[1:4] -; GFX8-NEXT: v_mov_b32_e32 v25, s3 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_mov_b32_e32 v24, s2 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 10, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 11, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 14, v2 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21] +; GFX8-NEXT: v_mov_b32_e32 v0, 1 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v18, 15, v2 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[16:19] +; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v2 +; GFX8-NEXT: v_mov_b32_e32 v23, s2 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_add_u32 s2, s0, 0x60 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[1:4] -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[16:19] +; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_mov_b32_e32 v23, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v25, s3 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13, v0 -; GFX8-NEXT: v_mov_b32_e32 v24, s2 +; GFX8-NEXT: v_mov_b32_e32 v26, s3 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 12, v2 +; GFX8-NEXT: v_mov_b32_e32 v25, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 13, v2 +; GFX8-NEXT: v_mov_b32_e32 v20, v17 +; GFX8-NEXT: v_mov_b32_e32 v1, v17 +; GFX8-NEXT: v_mov_b32_e32 v17, s1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 2, v0 -; 
GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[20:23] +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_mov_b32_e32 v16, s0 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v0 ; GFX8-NEXT: s_add_u32 s0, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 6, v0 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 7, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 6, v2 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[19:22] ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v12, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 5, v0 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v10, 3, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_mov_b32_e32 v21, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s1 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 4, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v10, 5, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v14, 3, v2 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v22, 1, v6 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: v_mov_b32_e32 v21, s1 ; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v3 -; GFX8-NEXT: v_mov_b32_e32 v20, s2 -; GFX8-NEXT: v_mov_b32_e32 v22, s0 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v14 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v22 +; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: v_mov_b32_e32 v20, s0 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v16i1_to_v16i64: @@ -5131,15 +5131,15 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 11, s2 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s2 -; GFX8-NEXT: s_lshr_b32 s7, s2, 24 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v0 +; GFX8-NEXT: s_lshr_b32 s14, s2, 24 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x10018 -; GFX8-NEXT: s_and_b32 s14, s2, 1 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10018 +; GFX8-NEXT: s_and_b32 s11, s2, 1 ; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10011 ; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10010 ; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10012 @@ -5148,133 +5148,128 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; 
GFX8-NEXT: s_bfe_u32 s20, s2, 0x10015 ; GFX8-NEXT: s_bfe_u32 s21, s2, 0x10016 ; GFX8-NEXT: s_bfe_u32 s22, s2, 0x10017 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 14, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 14, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s2 -; GFX8-NEXT: v_and_b32_e32 v4, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e64 v5, 6, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 4, s2 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 4, s2 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 15, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: s_add_u32 s4, s0, 0xa0 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: s_add_u32 s8, s0, 0x90 +; GFX8-NEXT: s_add_u32 s6, s0, 0x90 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: s_add_u32 s8, s0, 0x80 ; GFX8-NEXT: s_addc_u32 s9, s1, 0 -; GFX8-NEXT: s_add_u32 s10, s0, 0x80 -; GFX8-NEXT: s_addc_u32 s11, s1, 0 ; GFX8-NEXT: s_add_u32 s12, s0, 0x70 -; GFX8-NEXT: v_and_b32_e32 v11, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s7 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s14 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 ; GFX8-NEXT: v_and_b32_e32 v12, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s7 -; GFX8-NEXT: v_mov_b32_e32 v21, s13 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s14 +; GFX8-NEXT: v_mov_b32_e32 v25, s13 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v20, s12 +; GFX8-NEXT: v_mov_b32_e32 v24, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 0xf0 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX8-NEXT: v_mov_b32_e32 v17, v1 -; GFX8-NEXT: v_mov_b32_e32 v19, v1 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v14 +; GFX8-NEXT: v_mov_b32_e32 v21, v1 +; GFX8-NEXT: v_mov_b32_e32 v23, v1 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v21, s13 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 1, s7 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 6, s7 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX8-NEXT: v_mov_b32_e32 v20, s12 +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GFX8-NEXT: v_mov_b32_e32 v25, s13 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 6, s14 +; GFX8-NEXT: v_mov_b32_e32 v24, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 0x60 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v16 -; GFX8-NEXT: v_and_b32_e32 v16, 1, v22 -; GFX8-NEXT: v_lshrrev_b16_e64 v18, 7, s7 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v17 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s14 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX8-NEXT: v_mov_b32_e32 v19, s13 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v15 -; GFX8-NEXT: v_mov_b32_e32 v17, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, v1 -; GFX8-NEXT: v_mov_b32_e32 v18, s12 +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v11 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v15 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff, v16 +; GFX8-NEXT: v_mov_b32_e32 v16, s13 +; GFX8-NEXT: v_mov_b32_e32 v15, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 0x50 -; GFX8-NEXT: 
flat_store_dwordx4 v[18:19], v[14:17] -; GFX8-NEXT: v_and_b32_e32 v18, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v10 +; GFX8-NEXT: v_mov_b32_e32 v23, 0 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: v_mov_b32_e32 v10, s13 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[20:23] +; GFX8-NEXT: v_mov_b32_e32 v16, s13 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v9 +; GFX8-NEXT: v_mov_b32_e32 v24, v1 +; GFX8-NEXT: v_mov_b32_e32 v15, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 64 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, v1 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[23:26] +; GFX8-NEXT: v_mov_b32_e32 v15, 1 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v23 -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[18:21] -; GFX8-NEXT: v_mov_b32_e32 v9, 1 -; GFX8-NEXT: v_mov_b32_e32 v23, s13 -; GFX8-NEXT: v_and_b32_sdwa v18, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v6 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, v1 -; GFX8-NEXT: v_mov_b32_e32 v22, s12 +; GFX8-NEXT: v_and_b32_sdwa v23, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v16, s13 +; GFX8-NEXT: v_mov_b32_e32 v15, s12 ; GFX8-NEXT: s_add_u32 s12, s0, 48 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21] +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v8 +; GFX8-NEXT: v_mov_b32_e32 v26, 0 ; GFX8-NEXT: s_addc_u32 s13, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v5 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[23:26] +; GFX8-NEXT: v_mov_b32_e32 v16, s13 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v26, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s12 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[18:21] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_and_b32_e32 v18, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v0 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[23:26] +; GFX8-NEXT: v_mov_b32_e32 v16, s3 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s21 ; GFX8-NEXT: v_mov_b32_e32 v2, s22 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v15, s2 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v16, s5 ; GFX8-NEXT: v_mov_b32_e32 v0, s19 ; GFX8-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v15, s4 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v16, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_mov_b32_e32 v15, s6 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v16, s9 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v2, s15 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_mov_b32_e32 v15, s8 ; GFX8-NEXT: s_add_u32 s2, s0, 32 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[0:3] 
; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v21, 0 +; GFX8-NEXT: v_mov_b32_e32 v26, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[18:21] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[23:26] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX8-NEXT: v_mov_b32_e32 v15, 0 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v8 -; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s14 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_and_b32_e32 v9, 1, v13 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v0, s11 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[6:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mov_b32_e32 v3, v15 +; GFX8-NEXT: v_mov_b32_e32 v2, v10 +; GFX8-NEXT: v_mov_b32_e32 v3, v13 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -5282,26 +5277,29 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s7 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v10, 1, v13 -; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX8-NEXT: v_mov_b32_e32 v13, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s14 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX8-NEXT: v_mov_b32_e32 v12, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 2, s7 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[10:13] +; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s14 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v24 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX8-NEXT: v_mov_b32_e32 v17, 0 -; GFX8-NEXT: v_mov_b32_e32 v15, v1 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v18 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v20, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, v25 -; GFX8-NEXT: v_mov_b32_e32 v3, v26 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[19:22] +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: v_mov_b32_e32 v3, v17 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -5475,98 +5473,98 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: 
s_lshr_b32 s52, s4, 30 -; GFX6-NEXT: s_lshr_b32 s40, s4, 31 -; GFX6-NEXT: s_lshr_b32 s42, s4, 28 -; GFX6-NEXT: s_lshr_b32 s20, s4, 29 -; GFX6-NEXT: s_lshr_b32 s24, s4, 26 -; GFX6-NEXT: s_lshr_b32 s16, s4, 27 -; GFX6-NEXT: s_lshr_b32 s22, s4, 24 -; GFX6-NEXT: s_lshr_b32 s6, s4, 25 -; GFX6-NEXT: s_lshr_b32 s8, s4, 22 -; GFX6-NEXT: s_lshr_b32 s10, s4, 23 -; GFX6-NEXT: s_lshr_b32 s12, s4, 20 -; GFX6-NEXT: s_lshr_b32 s14, s4, 21 -; GFX6-NEXT: s_lshr_b32 s18, s4, 18 -; GFX6-NEXT: s_lshr_b32 s26, s4, 19 -; GFX6-NEXT: s_lshr_b32 s28, s4, 16 -; GFX6-NEXT: s_lshr_b32 s30, s4, 17 -; GFX6-NEXT: s_lshr_b32 s34, s4, 14 -; GFX6-NEXT: s_lshr_b32 s36, s4, 15 -; GFX6-NEXT: s_lshr_b32 s38, s4, 12 -; GFX6-NEXT: s_lshr_b32 s44, s4, 13 -; GFX6-NEXT: s_lshr_b32 s46, s4, 10 -; GFX6-NEXT: s_lshr_b32 s48, s4, 11 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[4:5], 0x10000 +; GFX6-NEXT: s_lshr_b32 s52, s8, 30 +; GFX6-NEXT: s_lshr_b32 s46, s8, 31 +; GFX6-NEXT: s_lshr_b32 s48, s8, 28 +; GFX6-NEXT: s_lshr_b32 s36, s8, 29 +; GFX6-NEXT: s_lshr_b32 s38, s8, 26 +; GFX6-NEXT: s_lshr_b32 s26, s8, 27 +; GFX6-NEXT: s_lshr_b32 s28, s8, 24 +; GFX6-NEXT: s_lshr_b32 s4, s8, 25 +; GFX6-NEXT: s_lshr_b32 s6, s8, 22 +; GFX6-NEXT: s_lshr_b32 s10, s8, 23 +; GFX6-NEXT: s_lshr_b32 s12, s8, 20 +; GFX6-NEXT: s_lshr_b32 s14, s8, 21 +; GFX6-NEXT: s_lshr_b32 s16, s8, 18 +; GFX6-NEXT: s_lshr_b32 s18, s8, 19 +; GFX6-NEXT: s_lshr_b32 s20, s8, 16 +; GFX6-NEXT: s_lshr_b32 s22, s8, 17 +; GFX6-NEXT: s_lshr_b32 s24, s8, 14 +; GFX6-NEXT: s_lshr_b32 s30, s8, 15 +; GFX6-NEXT: s_lshr_b32 s34, s8, 12 +; GFX6-NEXT: s_lshr_b32 s40, s8, 13 +; GFX6-NEXT: s_lshr_b32 s42, s8, 10 +; GFX6-NEXT: s_lshr_b32 s44, s8, 11 +; GFX6-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s50 ; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: s_lshr_b32 s50, s4, 8 +; GFX6-NEXT: s_lshr_b32 s50, s8, 8 ; GFX6-NEXT: v_mov_b32_e32 v2, s52 ; GFX6-NEXT: v_mov_b32_e32 v3, s53 -; GFX6-NEXT: s_lshr_b32 s52, s4, 9 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 6 -; GFX6-NEXT: v_mov_b32_e32 v6, s42 -; GFX6-NEXT: v_mov_b32_e32 v7, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 7 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v8, s20 -; GFX6-NEXT: v_mov_b32_e32 v9, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 4 -; GFX6-NEXT: v_mov_b32_e32 v10, s24 -; GFX6-NEXT: v_mov_b32_e32 v11, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 5 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: v_mov_b32_e32 v12, s16 -; GFX6-NEXT: v_mov_b32_e32 v13, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 2 -; GFX6-NEXT: v_mov_b32_e32 v14, s22 -; GFX6-NEXT: v_mov_b32_e32 v15, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 3 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: s_lshr_b32 
s52, s8, 9 ; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v4, s46 +; GFX6-NEXT: v_mov_b32_e32 v5, s47 +; GFX6-NEXT: s_lshr_b32 s46, s8, 6 +; GFX6-NEXT: v_mov_b32_e32 v6, s48 +; GFX6-NEXT: v_mov_b32_e32 v7, s49 +; GFX6-NEXT: s_lshr_b32 s48, s8, 7 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s36 +; GFX6-NEXT: v_mov_b32_e32 v9, s37 +; GFX6-NEXT: s_lshr_b32 s36, s8, 4 +; GFX6-NEXT: v_mov_b32_e32 v10, s38 +; GFX6-NEXT: v_mov_b32_e32 v11, s39 +; GFX6-NEXT: s_lshr_b32 s38, s8, 5 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s26 +; GFX6-NEXT: v_mov_b32_e32 v13, s27 +; GFX6-NEXT: s_lshr_b32 s26, s8, 2 +; GFX6-NEXT: v_mov_b32_e32 v14, s28 +; GFX6-NEXT: v_mov_b32_e32 v15, s29 +; GFX6-NEXT: s_lshr_b32 s28, s8, 3 +; GFX6-NEXT: s_lshr_b32 s8, s8, 1 +; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:240 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:224 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 -; GFX6-NEXT: v_mov_b32_e32 v16, s6 -; GFX6-NEXT: v_mov_b32_e32 v17, s7 +; GFX6-NEXT: v_mov_b32_e32 v16, s4 +; GFX6-NEXT: v_mov_b32_e32 v17, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(3) -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NEXT: v_mov_b32_e32 v4, s10 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 @@ -5577,34 +5575,34 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v5, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s18 -; GFX6-NEXT: v_mov_b32_e32 v3, s19 -; GFX6-NEXT: v_mov_b32_e32 v4, s26 -; GFX6-NEXT: v_mov_b32_e32 v5, 
s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NEXT: v_mov_b32_e32 v5, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: v_mov_b32_e32 v4, s22 +; GFX6-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v3, s25 ; GFX6-NEXT: v_mov_b32_e32 v4, s30 ; GFX6-NEXT: v_mov_b32_e32 v5, s31 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s34 ; GFX6-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NEXT: v_mov_b32_e32 v4, s36 -; GFX6-NEXT: v_mov_b32_e32 v5, s37 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 ; GFX6-NEXT: v_mov_b32_e32 v4, s44 ; GFX6-NEXT: v_mov_b32_e32 v5, s45 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:96 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: v_mov_b32_e32 v3, s47 -; GFX6-NEXT: v_mov_b32_e32 v4, s48 -; GFX6-NEXT: v_mov_b32_e32 v5, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s50 @@ -5613,26 +5611,26 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v5, s53 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: v_mov_b32_e32 v4, s42 -; GFX6-NEXT: v_mov_b32_e32 v5, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s46 +; GFX6-NEXT: v_mov_b32_e32 v3, s47 +; GFX6-NEXT: v_mov_b32_e32 v4, s48 +; GFX6-NEXT: v_mov_b32_e32 v5, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: v_mov_b32_e32 v4, s24 -; GFX6-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: v_mov_b32_e32 v4, s38 +; GFX6-NEXT: v_mov_b32_e32 v5, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s16 -; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: v_mov_b32_e32 v4, s22 -; GFX6-NEXT: v_mov_b32_e32 v5, s23 +; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v3, s27 +; GFX6-NEXT: v_mov_b32_e32 v4, s28 +; GFX6-NEXT: v_mov_b32_e32 v5, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -5640,19 +5638,19 @@ define amdgpu_kernel void 
@constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s8, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s10, s4, 22 -; GFX8-NEXT: s_lshr_b32 s12, s4, 23 -; GFX8-NEXT: s_lshr_b32 s14, s4, 20 -; GFX8-NEXT: s_lshr_b32 s16, s4, 21 -; GFX8-NEXT: s_lshr_b32 s18, s4, 18 -; GFX8-NEXT: s_lshr_b32 s20, s4, 19 -; GFX8-NEXT: s_lshr_b32 s22, s4, 16 -; GFX8-NEXT: s_lshr_b32 s24, s4, 17 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 +; GFX8-NEXT: s_lshr_b32 s10, s8, 22 +; GFX8-NEXT: s_lshr_b32 s12, s8, 23 +; GFX8-NEXT: s_lshr_b32 s14, s8, 20 +; GFX8-NEXT: s_lshr_b32 s16, s8, 21 +; GFX8-NEXT: s_lshr_b32 s18, s8, 18 +; GFX8-NEXT: s_lshr_b32 s20, s8, 19 +; GFX8-NEXT: s_lshr_b32 s22, s8, 16 +; GFX8-NEXT: s_lshr_b32 s24, s8, 17 +; GFX8-NEXT: s_lshr_b32 s6, s8, 24 ; GFX8-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[8:9], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 @@ -5693,97 +5691,101 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v16, s11 ; GFX8-NEXT: v_mov_b32_e32 v15, s10 ; GFX8-NEXT: s_add_u32 s10, s0, 0x70 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s8 ; GFX8-NEXT: v_mov_b32_e32 v11, s22 ; GFX8-NEXT: v_mov_b32_e32 v12, s23 ; GFX8-NEXT: v_mov_b32_e32 v13, s24 ; GFX8-NEXT: v_mov_b32_e32 v14, s25 ; GFX8-NEXT: s_addc_u32 s11, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s8 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 5, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 5, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s8 ; GFX8-NEXT: v_bfe_i32 v11, v10, 0, 1 ; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v14, s11 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 0x60 +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 0x60 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GFX8-NEXT: v_mov_b32_e32 v13, s10 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v12, s5 +; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 -; GFX8-NEXT: 
v_mov_b32_e32 v11, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v11, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 0x50 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s5 +; GFX8-NEXT: v_mov_b32_e32 v10, s9 ; GFX8-NEXT: v_bfe_i32 v7, v6, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 64 +; GFX8-NEXT: v_mov_b32_e32 v9, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 64 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8] -; GFX8-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NEXT: v_mov_b32_e32 v11, s9 ; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 48 +; GFX8-NEXT: v_mov_b32_e32 v10, s8 +; GFX8-NEXT: s_add_u32 s8, s0, 48 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[3:6] ; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NEXT: v_mov_b32_e32 v11, s9 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: v_mov_b32_e32 v10, s4 +; GFX8-NEXT: v_mov_b32_e32 v10, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 2, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 3, s6 ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[1:4] -; GFX8-NEXT: s_add_u32 s4, s0, 32 +; GFX8-NEXT: v_bfe_i32 v18, v16, 0, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 32 ; GFX8-NEXT: v_bfe_i32 v2, v1, 0, 1 -; GFX8-NEXT: v_bfe_i32 v18, v16, 0, 1 ; GFX8-NEXT: v_bfe_i32 v16, v0, 0, 1 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 16 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 ; GFX8-NEXT: v_bfe_i32 v18, v22, 0, 1 ; GFX8-NEXT: v_bfe_i32 v22, v21, 0, 1 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GFX8-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX8-NEXT: v_mov_b32_e32 v16, s8 -; GFX8-NEXT: v_mov_b32_e32 v17, s9 +; GFX8-NEXT: v_mov_b32_e32 v16, s4 +; GFX8-NEXT: v_mov_b32_e32 v17, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_add_u32 s4, s0, 0xf0 ; GFX8-NEXT: v_bfe_i32 v14, v13, 0, 1 @@ -5795,8 +5797,6 @@ define amdgpu_kernel void 
@constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -5804,8 +5804,6 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v8, v8, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xd0 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 2, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 3, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 @@ -6031,50 +6029,50 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s4, s2, 0x10003 ; GFX6-NEXT: s_bfe_u32 s5, s2, 0x10005 -; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10007 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0x10009 -; GFX6-NEXT: s_bfe_u32 s12, s2, 0x1000b -; GFX6-NEXT: s_bfe_u32 s14, s2, 0x1000d -; GFX6-NEXT: s_bfe_u32 s16, s2, 0x1000f -; GFX6-NEXT: s_bfe_u32 s18, s2, 0x10011 -; GFX6-NEXT: s_bfe_u32 s20, s2, 0x10013 -; GFX6-NEXT: s_bfe_u32 s22, s2, 0x10015 -; GFX6-NEXT: s_bfe_u32 s24, s2, 0x10017 -; GFX6-NEXT: s_bfe_u32 s25, s2, 0x10019 -; GFX6-NEXT: s_bfe_u32 s26, s2, 0x1001b -; GFX6-NEXT: s_bfe_u32 s27, s2, 0x1001d -; GFX6-NEXT: s_lshr_b32 s28, s2, 31 -; GFX6-NEXT: s_bfe_u32 s29, s3, 0x10003 -; GFX6-NEXT: s_bfe_u32 s30, s3, 0x10005 -; GFX6-NEXT: s_bfe_u32 s31, s3, 0x10007 -; GFX6-NEXT: s_bfe_u32 s33, s3, 0x10009 -; GFX6-NEXT: s_bfe_u32 s34, s3, 0x1000b -; GFX6-NEXT: s_bfe_u32 s35, s3, 0x1000d -; GFX6-NEXT: s_bfe_u32 s36, s3, 0x1000f -; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10011 -; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10013 -; GFX6-NEXT: s_bfe_u32 s39, s3, 0x10015 -; GFX6-NEXT: s_bfe_u32 s40, s3, 0x10017 -; GFX6-NEXT: s_bfe_u32 s41, s3, 0x10019 -; GFX6-NEXT: s_bfe_u32 s42, s3, 0x1001b -; GFX6-NEXT: s_bfe_u32 s43, s3, 0x1001d -; GFX6-NEXT: s_lshr_b32 s44, s3, 31 +; GFX6-NEXT: s_bfe_u32 s8, s2, 0x10007 +; GFX6-NEXT: s_bfe_u32 s11, s2, 0x10009 +; GFX6-NEXT: s_bfe_u32 s13, s2, 0x1000b +; GFX6-NEXT: s_bfe_u32 s15, s2, 0x1000d +; GFX6-NEXT: s_bfe_u32 s17, s2, 0x1000f +; GFX6-NEXT: s_bfe_u32 s19, s2, 0x10011 +; GFX6-NEXT: s_bfe_u32 s21, s2, 0x10013 +; GFX6-NEXT: s_bfe_u32 s23, s2, 0x10015 +; GFX6-NEXT: s_bfe_u32 s25, s2, 0x10017 +; GFX6-NEXT: s_bfe_u32 s27, s2, 0x10019 +; GFX6-NEXT: s_bfe_u32 s29, s2, 0x1001b +; GFX6-NEXT: s_bfe_u32 s31, s2, 0x1001d +; GFX6-NEXT: s_lshr_b32 s34, s2, 31 +; GFX6-NEXT: s_bfe_u32 s36, s3, 0x10003 +; GFX6-NEXT: s_bfe_u32 s37, s3, 0x10005 +; GFX6-NEXT: s_bfe_u32 s38, s3, 0x10007 +; GFX6-NEXT: s_bfe_u32 s39, s3, 0x10009 +; GFX6-NEXT: s_bfe_u32 s40, s3, 0x1000b +; GFX6-NEXT: s_bfe_u32 s41, s3, 0x1000d +; GFX6-NEXT: s_bfe_u32 s42, s3, 0x1000f +; GFX6-NEXT: s_bfe_u32 s43, s3, 0x10011 +; GFX6-NEXT: s_bfe_u32 s44, s3, 0x10013 +; GFX6-NEXT: s_bfe_u32 s45, s3, 0x10015 +; GFX6-NEXT: s_bfe_u32 s46, s3, 0x10017 +; GFX6-NEXT: s_bfe_u32 s47, s3, 0x10019 +; GFX6-NEXT: s_bfe_u32 s48, s3, 0x1001b +; GFX6-NEXT: s_bfe_u32 s49, s3, 0x1001d +; GFX6-NEXT: s_lshr_b32 s50, s3, 31 ; GFX6-NEXT: s_bfe_u32 s9, s3, 0x10001 -; GFX6-NEXT: s_bfe_u32 s7, s2, 0x10001 -; GFX6-NEXT: s_and_b32 s8, s2, 1 -; GFX6-NEXT: s_and_b32 s11, s3, 1 -; GFX6-NEXT: s_bfe_u32 s13, s2, 0x10002 -; GFX6-NEXT: s_bfe_u32 s15, s2, 0x10004 -; GFX6-NEXT: s_bfe_u32 s17, s2, 0x10006 -; 
GFX6-NEXT: s_bfe_u32 s19, s2, 0x10008 -; GFX6-NEXT: s_bfe_u32 s21, s2, 0x1000a -; GFX6-NEXT: s_bfe_u32 s23, s2, 0x1000c -; GFX6-NEXT: s_bfe_u32 s45, s2, 0x1000e -; GFX6-NEXT: s_bfe_u32 s46, s2, 0x10010 -; GFX6-NEXT: s_bfe_u32 s47, s2, 0x10012 -; GFX6-NEXT: s_bfe_u32 s48, s2, 0x10014 -; GFX6-NEXT: s_bfe_u32 s49, s2, 0x10016 -; GFX6-NEXT: s_bfe_u32 s50, s2, 0x10018 +; GFX6-NEXT: s_bfe_u32 s6, s2, 0x10001 +; GFX6-NEXT: s_and_b32 s7, s2, 1 +; GFX6-NEXT: s_and_b32 s10, s3, 1 +; GFX6-NEXT: s_bfe_u32 s12, s2, 0x10002 +; GFX6-NEXT: s_bfe_u32 s14, s2, 0x10004 +; GFX6-NEXT: s_bfe_u32 s16, s2, 0x10006 +; GFX6-NEXT: s_bfe_u32 s18, s2, 0x10008 +; GFX6-NEXT: s_bfe_u32 s20, s2, 0x1000a +; GFX6-NEXT: s_bfe_u32 s22, s2, 0x1000c +; GFX6-NEXT: s_bfe_u32 s24, s2, 0x1000e +; GFX6-NEXT: s_bfe_u32 s26, s2, 0x10010 +; GFX6-NEXT: s_bfe_u32 s28, s2, 0x10012 +; GFX6-NEXT: s_bfe_u32 s30, s2, 0x10014 +; GFX6-NEXT: s_bfe_u32 s33, s2, 0x10016 +; GFX6-NEXT: s_bfe_u32 s35, s2, 0x10018 ; GFX6-NEXT: s_bfe_u32 s51, s2, 0x1001a ; GFX6-NEXT: s_bfe_u32 s52, s2, 0x1001c ; GFX6-NEXT: s_bfe_u32 s53, s2, 0x1001e @@ -6098,138 +6096,138 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: v_mov_b32_e32 v0, s67 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v2, s50 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s68 -; GFX6-NEXT: v_mov_b32_e32 v2, s43 +; GFX6-NEXT: v_mov_b32_e32 v2, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v2, s48 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:464 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NEXT: v_mov_b32_e32 v2, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s47 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:448 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NEXT: v_mov_b32_e32 v2, s46 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NEXT: v_mov_b32_e32 v2, s39 +; GFX6-NEXT: v_mov_b32_e32 v2, s45 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:416 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NEXT: v_mov_b32_e32 v2, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s43 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NEXT: v_mov_b32_e32 v2, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s41 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s58 -; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v2, s40 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NEXT: v_mov_b32_e32 v2, s33 +; GFX6-NEXT: v_mov_b32_e32 v2, s39 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s56 -; GFX6-NEXT: v_mov_b32_e32 v2, s31 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s55 -; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v2, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s54 -; GFX6-NEXT: v_mov_b32_e32 v2, s29 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s53 -; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s52 -; GFX6-NEXT: v_mov_b32_e32 v2, s27 +; GFX6-NEXT: v_mov_b32_e32 v2, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s26 +; GFX6-NEXT: v_mov_b32_e32 v2, s29 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s50 -; GFX6-NEXT: v_mov_b32_e32 v2, s25 +; GFX6-NEXT: v_mov_b32_e32 v0, s35 +; GFX6-NEXT: v_mov_b32_e32 v2, s27 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s49 -; GFX6-NEXT: v_mov_b32_e32 v2, s24 +; GFX6-NEXT: v_mov_b32_e32 v0, s33 +; GFX6-NEXT: v_mov_b32_e32 v2, s25 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NEXT: v_mov_b32_e32 v2, s22 +; GFX6-NEXT: v_mov_b32_e32 v0, s30 +; GFX6-NEXT: v_mov_b32_e32 v2, s23 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s47 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v0, s28 +; GFX6-NEXT: v_mov_b32_e32 v2, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s46 -; GFX6-NEXT: v_mov_b32_e32 v2, s18 +; GFX6-NEXT: v_mov_b32_e32 v0, s26 +; GFX6-NEXT: v_mov_b32_e32 v2, s19 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s45 -; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v0, s24 +; GFX6-NEXT: v_mov_b32_e32 v2, s17 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s23 -; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s22 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s21 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s19 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: v_mov_b32_e32 v2, 
s11 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NEXT: v_mov_b32_e32 v2, s9 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v25, 1 +; GFX8-NEXT: v_mov_b32_e32 v28, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6244,28 +6242,28 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2 ; GFX8-NEXT: v_and_b32_e32 v10, 1, v1 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2 -; GFX8-NEXT: v_mov_b32_e32 v11, s2 +; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2 -; GFX8-NEXT: s_lshr_b32 s24, s3, 24 -; GFX8-NEXT: s_lshr_b32 s22, s2, 24 +; GFX8-NEXT: s_lshr_b32 s31, s3, 24 +; GFX8-NEXT: s_lshr_b32 s24, s2, 24 ; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s2 -; GFX8-NEXT: v_and_b32_e32 v8, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2 ; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10018 ; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10018 -; GFX8-NEXT: s_and_b32 s23, s3, 1 -; GFX8-NEXT: s_and_b32 s25, s2, 1 +; GFX8-NEXT: s_and_b32 s22, s3, 1 +; GFX8-NEXT: s_and_b32 s23, s2, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s2 -; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10011 -; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10010 -; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10012 -; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10013 -; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10014 -; GFX8-NEXT: s_bfe_u32 s31, s2, 0x10015 +; GFX8-NEXT: s_bfe_u32 s25, s2, 0x10011 +; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10010 +; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10012 +; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10013 +; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10014 +; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10015 ; GFX8-NEXT: s_bfe_u32 s33, s2, 0x10016 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10017 ; GFX8-NEXT: s_bfe_u32 s34, s3, 0x10011 @@ -6292,285 +6290,292 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: s_add_u32 s18, s0, 0x80 ; GFX8-NEXT: s_addc_u32 s19, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v6, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v9, 1, v1 ; 
GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s3 ; GFX8-NEXT: s_add_u32 s42, s0, 0x70 -; GFX8-NEXT: v_and_b32_e32 v7, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v23, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v24, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x170 +; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3 ; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[2:5] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v24, s42 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v22 -; GFX8-NEXT: v_mov_b32_e32 v22, s42 ; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s3 -; GFX8-NEXT: v_mov_b32_e32 v23, s43 -; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s24 -; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v25, s43 +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[2:5] ; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0 ; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s3 -; GFX8-NEXT: v_and_b32_e32 v23, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v21 +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s31 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v21, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s22 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 7, s24 -; GFX8-NEXT: v_mov_b32_e32 v22, s43 +; GFX8-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v21 +; GFX8-NEXT: v_lshrrev_b16_e64 v26, 7, s31 +; GFX8-NEXT: v_mov_b32_e32 v25, v1 +; GFX8-NEXT: v_mov_b32_e32 v27, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0xf0 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[2:5] +; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s24 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, s42 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 7, s22 -; GFX8-NEXT: v_mov_b32_e32 v21, s43 -; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v20 +; GFX8-NEXT: v_lshrrev_b16_e64 v26, 7, s24 +; GFX8-NEXT: v_mov_b32_e32 v25, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x60 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3 -; GFX8-NEXT: v_and_b32_e32 v20, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3 -; GFX8-NEXT: v_and_b32_e32 v21, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v19 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v18 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v19 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v18 ; GFX8-NEXT: v_mov_b32_e32 v18, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, 0 ; GFX8-NEXT: v_mov_b32_e32 v19, s43 -; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[2:5] ; GFX8-NEXT: s_add_u32 s42, s0, 0x50 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 5, s3 -; GFX8-NEXT: v_and_b32_e32 v19, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v17 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v17 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v16 ; GFX8-NEXT: v_mov_b32_e32 v16, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, 0 ; GFX8-NEXT: v_mov_b32_e32 v17, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 64 -; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[2:5] +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[24:27] +; 
GFX8-NEXT: v_mov_b32_e32 v19, 1 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v15 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v15 ; GFX8-NEXT: v_mov_b32_e32 v15, s42 -; GFX8-NEXT: v_and_b32_sdwa v2, v11, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_and_b32_sdwa v24, v12, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v27, 0 ; GFX8-NEXT: v_mov_b32_e32 v16, s43 -; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[2:5] ; GFX8-NEXT: s_add_u32 s42, s0, 48 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s3 -; GFX8-NEXT: v_and_b32_e32 v15, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v14 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v13, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, s43 -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[2:5] +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v13 +; GFX8-NEXT: v_mov_b32_e32 v12, s42 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v14 +; GFX8-NEXT: v_mov_b32_e32 v27, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 32 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 1, s3 -; GFX8-NEXT: v_and_b32_e32 v26, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 5, s24 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[24:27] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v24, 1, v11 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff, v10 ; GFX8-NEXT: v_mov_b32_e32 v10, s42 -; GFX8-NEXT: v_and_b32_e32 v27, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v12 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v27, 0 ; GFX8-NEXT: v_mov_b32_e32 v11, s43 -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[2:5] +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 5, s31 +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[24:27] +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 3, s31 ; GFX8-NEXT: s_add_u32 s42, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s24 -; GFX8-NEXT: v_and_b32_e32 v29, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v9 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v12 +; GFX8-NEXT: v_and_b32_e32 v25, 1, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v7 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s42 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, s43 -; GFX8-NEXT: s_add_u32 s42, s0, 0x160 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 12, s3 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[2:5] -; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 1, s24 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, s42 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v24 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, s43 +; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[10:13] +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s31 +; GFX8-NEXT: s_add_u32 s42, s0, 0x160 +; GFX8-NEXT: v_lshrrev_b16_e64 v23, 12, s3 +; GFX8-NEXT: v_and_b32_e32 v27, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s42 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v23 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, s43 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[10:13] +; GFX8-NEXT: 
v_lshrrev_b16_e64 v6, 5, s24 ; GFX8-NEXT: s_add_u32 s42, s0, 0x150 -; GFX8-NEXT: v_lshrrev_b16_e64 v22, 10, s3 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[2:5] +; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s3 +; GFX8-NEXT: v_and_b32_e32 v23, 1, v6 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 5, s22 -; GFX8-NEXT: v_mov_b32_e32 v7, s42 -; GFX8-NEXT: v_and_b32_e32 v14, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v22 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s43 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[2:5] +; GFX8-NEXT: v_mov_b32_e32 v6, s42 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v21 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v4 +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: v_mov_b32_e32 v7, s43 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[10:13] +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 1, s24 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 ; GFX8-NEXT: s_add_u32 s42, s0, 0x140 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s22 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v2 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 1, s22 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v20, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v6 ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, s42 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v2 -; GFX8-NEXT: v_and_b32_sdwa v2, v0, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v20 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s43 +; GFX8-NEXT: v_mov_b32_e32 v6, s42 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3 +; GFX8-NEXT: v_and_b32_sdwa v19, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v20, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x130 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s3 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[2:5] +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s3 +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[19:22] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v7, s42 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v18 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v21 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, s43 +; GFX8-NEXT: v_mov_b32_e32 v6, s42 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v2 +; GFX8-NEXT: v_mov_b32_e32 v21, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x120 ; GFX8-NEXT: v_lshrrev_b16_e64 v17, 4, s3 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s22 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[2:5] +; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[18:21] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v5, 1, v10 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v9 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v17 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v19 -; GFX8-NEXT: v_mov_b32_e32 v12, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s3 +; GFX8-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX8-NEXT: v_mov_b32_e32 v20, 0 +; GFX8-NEXT: v_mov_b32_e32 v18, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NEXT: s_add_u32 s42, s0, 0x110 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v5 ; GFX8-NEXT: 
v_lshrrev_b16_e64 v16, 2, s3 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s24 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20] ; GFX8-NEXT: s_addc_u32 s43, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v14 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v16 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX8-NEXT: v_mov_b32_e32 v16, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s31 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX8-NEXT: v_mov_b32_e32 v19, 0 +; GFX8-NEXT: v_mov_b32_e32 v17, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: v_mov_b32_e32 v22, s5 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v23 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] ; GFX8-NEXT: v_mov_b32_e32 v0, s41 +; GFX8-NEXT: v_and_b32_e32 v19, 1, v24 +; GFX8-NEXT: v_mov_b32_e32 v24, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s40 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v21, s4 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s7 +; GFX8-NEXT: v_mov_b32_e32 v23, s4 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s38 ; GFX8-NEXT: v_mov_b32_e32 v2, s39 -; GFX8-NEXT: v_mov_b32_e32 v21, s6 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s9 +; GFX8-NEXT: v_mov_b32_e32 v23, s6 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s9 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v2, s37 -; GFX8-NEXT: v_mov_b32_e32 v21, s8 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s11 +; GFX8-NEXT: v_mov_b32_e32 v23, s8 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s11 ; GFX8-NEXT: v_mov_b32_e32 v0, s35 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v21, s10 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s13 +; GFX8-NEXT: v_mov_b32_e32 v23, s10 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s13 ; GFX8-NEXT: v_mov_b32_e32 v0, s33 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v21, s12 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s30 -; GFX8-NEXT: v_mov_b32_e32 v2, s31 -; GFX8-NEXT: v_mov_b32_e32 v21, s14 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s17 -; GFX8-NEXT: v_mov_b32_e32 v0, s28 -; GFX8-NEXT: v_mov_b32_e32 v2, s29 -; GFX8-NEXT: v_mov_b32_e32 v21, s16 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s19 +; GFX8-NEXT: v_mov_b32_e32 v23, s12 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s15 +; GFX8-NEXT: v_mov_b32_e32 v0, s29 +; GFX8-NEXT: v_mov_b32_e32 v2, s30 +; GFX8-NEXT: v_mov_b32_e32 v23, s14 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s17 ; GFX8-NEXT: v_mov_b32_e32 v0, s27 -; GFX8-NEXT: v_mov_b32_e32 v2, s26 -; GFX8-NEXT: v_mov_b32_e32 v21, s18 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NEXT: v_mov_b32_e32 v23, s16 +; GFX8-NEXT: 
flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s19 +; GFX8-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NEXT: v_mov_b32_e32 v2, s25 +; GFX8-NEXT: v_mov_b32_e32 v23, s18 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 1, s3 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s1 ; GFX8-NEXT: s_add_u32 s2, s0, 0x100 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s25 -; GFX8-NEXT: v_mov_b32_e32 v21, s0 -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v26 -; GFX8-NEXT: v_mov_b32_e32 v18, 0 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v22, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s23 -; GFX8-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-NEXT: v_mov_b32_e32 v21, s2 +; GFX8-NEXT: v_mov_b32_e32 v23, s0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v14 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX8-NEXT: v_mov_b32_e32 v14, 0 +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v24, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, v14 +; GFX8-NEXT: v_mov_b32_e32 v23, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0 -; GFX8-NEXT: v_lshrrev_b16_e64 v28, 4, s24 -; GFX8-NEXT: flat_store_dwordx4 v[21:22], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_and_b32_e32 v17, 1, v28 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v27 -; GFX8-NEXT: v_mov_b32_e32 v20, 0 -; GFX8-NEXT: v_mov_b32_e32 v18, v1 +; GFX8-NEXT: v_mov_b32_e32 v22, 0 +; GFX8-NEXT: v_mov_b32_e32 v20, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[19:22] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v30, 2, s24 +; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s31 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0 -; GFX8-NEXT: v_and_b32_e32 v13, 1, v30 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v29 -; GFX8-NEXT: v_mov_b32_e32 v16, 0 -; GFX8-NEXT: v_mov_b32_e32 v14, v1 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v26 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v25 +; GFX8-NEXT: v_mov_b32_e32 v18, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, v1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[13:16] +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18] ; GFX8-NEXT: v_mov_b32_e32 v0, s21 -; GFX8-NEXT: v_mov_b32_e32 v14, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: v_mov_b32_e32 v13, s2 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, v27 +; GFX8-NEXT: v_mov_b32_e32 v3, v28 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 -; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s22 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 4, s24 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 -; GFX8-NEXT: v_and_b32_e32 v9, 1, v24 -; GFX8-NEXT: v_mov_b32_e32 v12, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, v1 +; GFX8-NEXT: 
v_and_b32_e32 v8, 1, v8 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s24 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[8:11] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_add_u32 s0, s0, 0xc0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v25, 0 -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[5:8] +; GFX8-NEXT: v_mov_b32_e32 v13, 0 +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, v23 -; GFX8-NEXT: v_mov_b32_e32 v3, v25 +; GFX8-NEXT: v_mov_b32_e32 v2, v12 +; GFX8-NEXT: v_mov_b32_e32 v3, v13 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -6582,12 +6587,12 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: ALU 95, @41, KC0[], KC1[] ; EG-NEXT: ALU 99, @137, KC0[CB0:0-32], KC1[] ; EG-NEXT: ALU 60, @237, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T82.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T81.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T80.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T79.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T78.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T77.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T82.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T81.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T80.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T79.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T78.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T77.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T76.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T75.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T74.X, 0 @@ -6613,149 +6618,149 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T54.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T53.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T52.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T51.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T51.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 38: -; EG-NEXT: VTX_READ_64 T19.XY, T19.X, 0, #1 +; EG-NEXT: VTX_READ_64 T25.XY, T19.X, 0, #1 ; EG-NEXT: ALU clause starting at 40: ; EG-NEXT: MOV * T19.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 41: -; EG-NEXT: LSHR * T20.Z, T19.Y, literal.x, +; EG-NEXT: LSHR * T19.Z, T25.Y, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T20.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T20.Y, 0.0, -; EG-NEXT: BFE_UINT * T21.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T19.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T19.Y, 0.0, +; EG-NEXT: BFE_UINT * T20.Z, T25.Y, literal.y, 1, ; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44) -; EG-NEXT: BFE_UINT T21.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T21.Y, 0.0, -; EG-NEXT: BFE_UINT * T22.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T20.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T20.Y, 0.0, +; EG-NEXT: BFE_UINT * T21.Z, T25.Y, literal.y, 1, ; EG-NEXT: 
28(3.923636e-44), 27(3.783506e-44) -; EG-NEXT: BFE_UINT T22.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T22.Y, 0.0, -; EG-NEXT: BFE_UINT * T23.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T21.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T21.Y, 0.0, +; EG-NEXT: BFE_UINT * T22.Z, T25.Y, literal.y, 1, ; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44) -; EG-NEXT: BFE_UINT T23.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T23.Y, 0.0, -; EG-NEXT: BFE_UINT * T24.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T22.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T22.Y, 0.0, +; EG-NEXT: BFE_UINT * T23.Z, T25.Y, literal.y, 1, ; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44) -; EG-NEXT: BFE_UINT T24.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T24.Y, 0.0, -; EG-NEXT: BFE_UINT * T25.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T23.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T23.Y, 0.0, +; EG-NEXT: BFE_UINT * T24.Z, T25.Y, literal.y, 1, ; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44) -; EG-NEXT: BFE_UINT T25.X, T19.Y, literal.x, 1, -; EG-NEXT: MOV T25.Y, 0.0, -; EG-NEXT: BFE_UINT * T26.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT T24.X, T25.Y, literal.x, 1, +; EG-NEXT: MOV T24.Y, 0.0, +; EG-NEXT: BFE_UINT * T26.Z, T25.Y, literal.y, 1, ; EG-NEXT: 20(2.802597e-44), 19(2.662467e-44) -; EG-NEXT: BFE_UINT T26.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T26.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T26.Y, 0.0, -; EG-NEXT: BFE_UINT * T27.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T27.Z, T25.Y, literal.y, 1, ; EG-NEXT: 18(2.522337e-44), 17(2.382207e-44) -; EG-NEXT: BFE_UINT T27.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T27.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T27.Y, 0.0, -; EG-NEXT: BFE_UINT * T28.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T28.Z, T25.Y, literal.y, 1, ; EG-NEXT: 16(2.242078e-44), 15(2.101948e-44) -; EG-NEXT: BFE_UINT T28.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T28.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T28.Y, 0.0, -; EG-NEXT: BFE_UINT * T29.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T29.Z, T25.Y, literal.y, 1, ; EG-NEXT: 14(1.961818e-44), 13(1.821688e-44) -; EG-NEXT: BFE_UINT T29.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T29.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T29.Y, 0.0, -; EG-NEXT: BFE_UINT * T30.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T30.Z, T25.Y, literal.y, 1, ; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44) -; EG-NEXT: BFE_UINT T30.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T30.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T30.Y, 0.0, -; EG-NEXT: BFE_UINT * T31.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T31.Z, T25.Y, literal.y, 1, ; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44) -; EG-NEXT: BFE_UINT T31.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T31.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T31.Y, 0.0, -; EG-NEXT: BFE_UINT * T32.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T32.Z, T25.Y, literal.y, 1, ; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45) -; EG-NEXT: BFE_UINT T32.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T32.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T32.Y, 0.0, -; EG-NEXT: BFE_UINT * T33.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T33.Z, T25.Y, literal.y, 1, ; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45) -; EG-NEXT: BFE_UINT T33.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T33.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T33.Y, 0.0, -; EG-NEXT: BFE_UINT * T34.Z, T19.Y, literal.y, 1, +; EG-NEXT: BFE_UINT * T34.Z, T25.Y, literal.y, 1, ; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) -; EG-NEXT: BFE_UINT T34.X, T19.Y, literal.x, 1, +; EG-NEXT: BFE_UINT T34.X, T25.Y, literal.x, 1, ; EG-NEXT: MOV T34.Y, 0.0, -; 
EG-NEXT: BFE_UINT T35.Z, T19.Y, 1, 1, -; EG-NEXT: AND_INT * T35.X, T19.Y, 1, +; EG-NEXT: BFE_UINT T35.Z, T25.Y, 1, 1, +; EG-NEXT: AND_INT * T35.X, T25.Y, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: MOV T35.Y, 0.0, -; EG-NEXT: LSHR * T36.Z, T19.X, literal.x, +; EG-NEXT: LSHR * T36.Z, T25.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T36.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T36.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T36.Y, 0.0, -; EG-NEXT: BFE_UINT * T37.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T37.Z, T25.X, literal.y, 1, ; EG-NEXT: 30(4.203895e-44), 29(4.063766e-44) -; EG-NEXT: BFE_UINT T37.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T37.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T37.Y, 0.0, -; EG-NEXT: BFE_UINT * T38.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T38.Z, T25.X, literal.y, 1, ; EG-NEXT: 28(3.923636e-44), 27(3.783506e-44) -; EG-NEXT: BFE_UINT T38.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T38.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T38.Y, 0.0, -; EG-NEXT: BFE_UINT * T39.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T39.Z, T25.X, literal.y, 1, ; EG-NEXT: 26(3.643376e-44), 25(3.503246e-44) -; EG-NEXT: BFE_UINT T39.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T39.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T39.Y, 0.0, -; EG-NEXT: BFE_UINT * T40.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T40.Z, T25.X, literal.y, 1, ; EG-NEXT: 24(3.363116e-44), 23(3.222986e-44) -; EG-NEXT: BFE_UINT T40.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T40.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T40.Y, 0.0, -; EG-NEXT: BFE_UINT * T41.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T41.Z, T25.X, literal.y, 1, ; EG-NEXT: 22(3.082857e-44), 21(2.942727e-44) -; EG-NEXT: BFE_UINT T41.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T41.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T41.Y, 0.0, -; EG-NEXT: BFE_UINT * T42.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T42.Z, T25.X, literal.y, 1, ; EG-NEXT: 20(2.802597e-44), 19(2.662467e-44) -; EG-NEXT: BFE_UINT T42.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T42.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T42.Y, 0.0, -; EG-NEXT: BFE_UINT * T43.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T43.Z, T25.X, literal.y, 1, ; EG-NEXT: 18(2.522337e-44), 17(2.382207e-44) -; EG-NEXT: BFE_UINT * T43.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T43.X, T25.X, literal.x, 1, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 137: ; EG-NEXT: MOV T43.Y, 0.0, -; EG-NEXT: BFE_UINT * T44.Z, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT * T44.Z, T25.X, literal.x, 1, ; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T44.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T44.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T44.Y, 0.0, -; EG-NEXT: BFE_UINT * T45.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T45.Z, T25.X, literal.y, 1, ; EG-NEXT: 14(1.961818e-44), 13(1.821688e-44) -; EG-NEXT: BFE_UINT T45.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T45.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T45.Y, 0.0, -; EG-NEXT: BFE_UINT * T46.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T46.Z, T25.X, literal.y, 1, ; EG-NEXT: 12(1.681558e-44), 11(1.541428e-44) -; EG-NEXT: BFE_UINT T46.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T46.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T46.Y, 0.0, -; EG-NEXT: BFE_UINT * T47.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T47.Z, T25.X, literal.y, 1, ; EG-NEXT: 10(1.401298e-44), 9(1.261169e-44) -; EG-NEXT: BFE_UINT T47.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T47.X, T25.X, literal.x, 1, ; 
EG-NEXT: MOV T47.Y, 0.0, -; EG-NEXT: BFE_UINT * T48.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T48.Z, T25.X, literal.y, 1, ; EG-NEXT: 8(1.121039e-44), 7(9.809089e-45) -; EG-NEXT: BFE_UINT T48.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T48.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T48.Y, 0.0, -; EG-NEXT: BFE_UINT * T49.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T49.Z, T25.X, literal.y, 1, ; EG-NEXT: 6(8.407791e-45), 5(7.006492e-45) -; EG-NEXT: BFE_UINT T49.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T49.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T49.Y, 0.0, -; EG-NEXT: BFE_UINT * T50.Z, T19.X, literal.y, 1, +; EG-NEXT: BFE_UINT * T50.Z, T25.X, literal.y, 1, ; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) -; EG-NEXT: BFE_UINT T50.X, T19.X, literal.x, 1, +; EG-NEXT: BFE_UINT T50.X, T25.X, literal.x, 1, ; EG-NEXT: MOV T50.Y, 0.0, -; EG-NEXT: BFE_UINT T19.Z, T19.X, 1, 1, -; EG-NEXT: AND_INT * T19.X, T19.X, 1, +; EG-NEXT: BFE_UINT T25.Z, T25.X, 1, 1, +; EG-NEXT: AND_INT * T25.X, T25.X, 1, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: MOV T19.Y, 0.0, -; EG-NEXT: MOV T20.W, 0.0, -; EG-NEXT: MOV * T21.W, 0.0, -; EG-NEXT: MOV T22.W, 0.0, -; EG-NEXT: MOV * T23.W, 0.0, -; EG-NEXT: MOV T24.W, 0.0, -; EG-NEXT: MOV * T25.W, 0.0, +; EG-NEXT: MOV T25.Y, 0.0, +; EG-NEXT: MOV T19.W, 0.0, +; EG-NEXT: MOV * T20.W, 0.0, +; EG-NEXT: MOV T21.W, 0.0, +; EG-NEXT: MOV * T22.W, 0.0, +; EG-NEXT: MOV T23.W, 0.0, +; EG-NEXT: MOV * T24.W, 0.0, ; EG-NEXT: MOV T26.W, 0.0, ; EG-NEXT: MOV * T27.W, 0.0, ; EG-NEXT: MOV T28.W, 0.0, @@ -6781,7 +6786,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: MOV T48.W, 0.0, ; EG-NEXT: MOV * T49.W, 0.0, ; EG-NEXT: MOV T50.W, 0.0, -; EG-NEXT: MOV * T19.W, 0.0, +; EG-NEXT: MOV * T25.W, 0.0, ; EG-NEXT: LSHR T51.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -6896,11 +6901,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s48, s5, 30 ; GFX6-NEXT: s_lshr_b32 s46, s5, 28 -; GFX6-NEXT: s_lshr_b32 s42, s5, 29 -; GFX6-NEXT: s_lshr_b32 s38, s5, 26 -; GFX6-NEXT: s_lshr_b32 s44, s5, 27 +; GFX6-NEXT: s_lshr_b32 s44, s5, 29 +; GFX6-NEXT: s_lshr_b32 s40, s5, 26 +; GFX6-NEXT: s_lshr_b32 s42, s5, 27 ; GFX6-NEXT: s_lshr_b32 s36, s5, 24 -; GFX6-NEXT: s_lshr_b32 s40, s5, 25 +; GFX6-NEXT: s_lshr_b32 s38, s5, 25 ; GFX6-NEXT: s_lshr_b32 s30, s5, 22 ; GFX6-NEXT: s_lshr_b32 s34, s5, 23 ; GFX6-NEXT: s_lshr_b32 s26, s5, 20 @@ -6925,38 +6930,38 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v5, s53 ; GFX6-NEXT: s_lshr_b32 s52, s5, 9 ; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[46:47], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v6, s48 ; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; GFX6-NEXT: s_lshr_b32 s48, s5, 6 -; GFX6-NEXT: v_mov_b32_e32 v10, s46 -; GFX6-NEXT: v_mov_b32_e32 v11, s47 -; GFX6-NEXT: s_lshr_b32 s46, s5, 7 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: v_mov_b32_e32 v12, s42 -; GFX6-NEXT: v_mov_b32_e32 v13, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 4 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_lshr_b32 s46, s5, 6 +; GFX6-NEXT: v_mov_b32_e32 v10, s54 +; GFX6-NEXT: v_mov_b32_e32 v11, s55 +; GFX6-NEXT: s_lshr_b32 s48, s5, 7 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 
0x10000 -; GFX6-NEXT: v_mov_b32_e32 v14, s38 -; GFX6-NEXT: v_mov_b32_e32 v15, s39 -; GFX6-NEXT: s_lshr_b32 s54, s5, 5 -; GFX6-NEXT: v_mov_b32_e32 v16, s44 -; GFX6-NEXT: v_mov_b32_e32 v17, s45 -; GFX6-NEXT: s_lshr_b32 s38, s5, 2 +; GFX6-NEXT: s_ashr_i32 s7, s5, 31 +; GFX6-NEXT: v_mov_b32_e32 v12, s44 +; GFX6-NEXT: v_mov_b32_e32 v13, s45 +; GFX6-NEXT: s_lshr_b32 s44, s5, 4 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v14, s40 +; GFX6-NEXT: v_mov_b32_e32 v15, s41 +; GFX6-NEXT: s_lshr_b32 s42, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v16, s54 +; GFX6-NEXT: v_mov_b32_e32 v17, s55 +; GFX6-NEXT: s_lshr_b32 s40, s5, 2 ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v9, s7 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:496 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v6, s36 ; GFX6-NEXT: v_mov_b32_e32 v7, s37 ; GFX6-NEXT: s_lshr_b32 s36, s5, 3 -; GFX6-NEXT: v_mov_b32_e32 v8, s40 -; GFX6-NEXT: v_mov_b32_e32 v9, s41 -; GFX6-NEXT: s_lshr_b32 s40, s5, 1 +; GFX6-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NEXT: s_lshr_b32 s38, s5, 1 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:480 @@ -6987,15 +6992,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v8, s24 ; GFX6-NEXT: v_mov_b32_e32 v9, s25 ; GFX6-NEXT: s_lshr_b32 s24, s4, 27 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:432 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v10, s18 ; GFX6-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NEXT: s_lshr_b32 s44, s4, 24 -; GFX6-NEXT: v_mov_b32_e32 v12, s20 -; GFX6-NEXT: v_mov_b32_e32 v13, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 24 +; GFX6-NEXT: v_mov_b32_e32 v12, s54 +; GFX6-NEXT: v_mov_b32_e32 v13, s55 ; GFX6-NEXT: s_lshr_b32 s18, s4, 25 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 @@ -7027,50 +7032,50 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v12, s8 ; GFX6-NEXT: v_mov_b32_e32 v13, s9 ; GFX6-NEXT: s_lshr_b32 s8, s4, 19 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v14, s50 ; GFX6-NEXT: v_mov_b32_e32 v15, s51 ; GFX6-NEXT: s_lshr_b32 s50, s4, 16 -; GFX6-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 17 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s52 +; GFX6-NEXT: v_mov_b32_e32 v17, s53 +; GFX6-NEXT: s_lshr_b32 s52, s4, 17 ; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, s48 -; GFX6-NEXT: v_mov_b32_e32 v7, s49 -; 
GFX6-NEXT: s_lshr_b32 s48, s4, 14 -; GFX6-NEXT: v_mov_b32_e32 v8, s46 -; GFX6-NEXT: v_mov_b32_e32 v9, s47 -; GFX6-NEXT: s_lshr_b32 s46, s4, 15 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[54:55], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v6, s46 +; GFX6-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NEXT: s_lshr_b32 s46, s4, 14 +; GFX6-NEXT: v_mov_b32_e32 v8, s48 +; GFX6-NEXT: v_mov_b32_e32 v9, s49 +; GFX6-NEXT: s_lshr_b32 s48, s4, 15 +; GFX6-NEXT: s_bfe_i64 s[54:55], s[42:43], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[44:45], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v10, s42 ; GFX6-NEXT: v_mov_b32_e32 v11, s43 ; GFX6-NEXT: s_lshr_b32 s42, s4, 12 -; GFX6-NEXT: v_mov_b32_e32 v12, s52 -; GFX6-NEXT: v_mov_b32_e32 v13, s53 -; GFX6-NEXT: s_lshr_b32 s52, s4, 13 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v12, s54 +; GFX6-NEXT: v_mov_b32_e32 v13, s55 +; GFX6-NEXT: s_lshr_b32 s44, s4, 13 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v14, s38 -; GFX6-NEXT: v_mov_b32_e32 v15, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v14, s40 +; GFX6-NEXT: v_mov_b32_e32 v15, s41 +; GFX6-NEXT: s_lshr_b32 s40, s4, 10 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v16, s36 ; GFX6-NEXT: v_mov_b32_e32 v17, s37 ; GFX6-NEXT: s_lshr_b32 s36, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: s_lshr_b32 s38, s4, 8 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:304 @@ -7101,29 +7106,29 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: v_mov_b32_e32 v16, s24 ; GFX6-NEXT: v_mov_b32_e32 v17, s25 ; GFX6-NEXT: s_lshr_b32 s24, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 3 +; GFX6-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NEXT: s_lshr_b32 s20, s4, 3 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], 
s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 @@ -7159,30 +7164,30 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s50 ; GFX6-NEXT: v_mov_b32_e32 v1, s51 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NEXT: v_mov_b32_e32 v2, s52 +; GFX6-NEXT: v_mov_b32_e32 v3, s53 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NEXT: v_mov_b32_e32 v1, s49 -; GFX6-NEXT: v_mov_b32_e32 v2, s46 -; GFX6-NEXT: v_mov_b32_e32 v3, s47 +; GFX6-NEXT: v_mov_b32_e32 v0, s46 +; GFX6-NEXT: v_mov_b32_e32 v1, s47 +; GFX6-NEXT: v_mov_b32_e32 v2, s48 +; GFX6-NEXT: v_mov_b32_e32 v3, s49 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s42 ; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s52 -; GFX6-NEXT: v_mov_b32_e32 v3, s53 +; GFX6-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NEXT: v_mov_b32_e32 v3, s45 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s38 -; GFX6-NEXT: v_mov_b32_e32 v1, s39 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s41 ; GFX6-NEXT: v_mov_b32_e32 v2, s36 ; GFX6-NEXT: v_mov_b32_e32 v3, s37 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: v_mov_b32_e32 v0, s38 +; GFX6-NEXT: v_mov_b32_e32 v1, s39 ; GFX6-NEXT: v_mov_b32_e32 v2, s30 ; GFX6-NEXT: v_mov_b32_e32 v3, s31 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 @@ -7201,8 +7206,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s24 ; GFX6-NEXT: v_mov_b32_e32 v1, s25 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: v_mov_b32_e32 v3, s21 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-NEXT: v_mov_b32_e32 v7, s5 @@ -7215,31 +7220,31 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_mov_b32 s13, 0 ; GFX8-NEXT: s_mov_b32 s11, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s14, s5, 22 -; GFX8-NEXT: s_lshr_b32 s16, s5, 23 -; GFX8-NEXT: s_lshr_b32 s20, s5, 20 -; GFX8-NEXT: s_lshr_b32 s22, s5, 21 -; GFX8-NEXT: s_lshr_b32 s24, s5, 18 -; GFX8-NEXT: s_lshr_b32 s26, s5, 19 -; GFX8-NEXT: s_lshr_b32 s28, s5, 16 -; GFX8-NEXT: s_lshr_b32 s30, s5, 17 -; GFX8-NEXT: s_lshr_b32 s34, s4, 22 -; GFX8-NEXT: s_lshr_b32 s36, s4, 23 -; GFX8-NEXT: s_lshr_b32 s38, s4, 20 -; GFX8-NEXT: s_lshr_b32 s40, s4, 21 -; GFX8-NEXT: s_lshr_b32 s42, s4, 18 -; GFX8-NEXT: s_lshr_b32 s44, 
s4, 19 -; GFX8-NEXT: s_lshr_b32 s46, s4, 16 -; GFX8-NEXT: s_lshr_b32 s48, s4, 17 -; GFX8-NEXT: s_mov_b32 s12, s5 -; GFX8-NEXT: s_lshr_b32 s10, s5, 24 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 +; GFX8-NEXT: s_lshr_b32 s16, s9, 22 +; GFX8-NEXT: s_lshr_b32 s18, s9, 23 +; GFX8-NEXT: s_lshr_b32 s20, s9, 20 +; GFX8-NEXT: s_lshr_b32 s22, s9, 21 +; GFX8-NEXT: s_lshr_b32 s24, s9, 18 +; GFX8-NEXT: s_lshr_b32 s26, s9, 19 +; GFX8-NEXT: s_lshr_b32 s28, s9, 16 +; GFX8-NEXT: s_lshr_b32 s30, s9, 17 +; GFX8-NEXT: s_lshr_b32 s34, s8, 22 +; GFX8-NEXT: s_lshr_b32 s36, s8, 23 +; GFX8-NEXT: s_lshr_b32 s38, s8, 20 +; GFX8-NEXT: s_lshr_b32 s40, s8, 21 +; GFX8-NEXT: s_lshr_b32 s42, s8, 18 +; GFX8-NEXT: s_lshr_b32 s44, s8, 19 +; GFX8-NEXT: s_lshr_b32 s46, s8, 16 +; GFX8-NEXT: s_lshr_b32 s48, s8, 17 +; GFX8-NEXT: s_mov_b32 s12, s9 +; GFX8-NEXT: s_lshr_b32 s10, s9, 24 +; GFX8-NEXT: s_lshr_b32 s6, s8, 24 ; GFX8-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[8:9], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[14:15], s[8:9], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 @@ -7254,165 +7259,165 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX8-NEXT: v_mov_b32_e32 v11, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x1b0 -; GFX8-NEXT: v_mov_b32_e32 v12, s15 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x1a0 -; GFX8-NEXT: v_mov_b32_e32 v13, s16 -; GFX8-NEXT: v_mov_b32_e32 v14, s17 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, s16 +; GFX8-NEXT: s_add_u32 s16, s0, 0x1b0 +; GFX8-NEXT: v_mov_b32_e32 v12, s17 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_mov_b32_e32 v13, s18 +; GFX8-NEXT: v_mov_b32_e32 v14, s19 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x1a0 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x190 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s20 ; GFX8-NEXT: v_mov_b32_e32 v12, s21 ; GFX8-NEXT: v_mov_b32_e32 v13, s22 ; GFX8-NEXT: v_mov_b32_e32 v14, s23 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x190 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x180 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s24 ; GFX8-NEXT: v_mov_b32_e32 v12, s25 ; GFX8-NEXT: v_mov_b32_e32 v13, s26 ; GFX8-NEXT: v_mov_b32_e32 v14, s27 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x180 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: 
v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0xb0 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s28 ; GFX8-NEXT: v_mov_b32_e32 v12, s29 ; GFX8-NEXT: v_mov_b32_e32 v13, s30 ; GFX8-NEXT: v_mov_b32_e32 v14, s31 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0xb0 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0xa0 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s34 ; GFX8-NEXT: v_mov_b32_e32 v12, s35 ; GFX8-NEXT: v_mov_b32_e32 v13, s36 ; GFX8-NEXT: v_mov_b32_e32 v14, s37 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0xa0 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x90 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s38 ; GFX8-NEXT: v_mov_b32_e32 v12, s39 ; GFX8-NEXT: v_mov_b32_e32 v13, s40 ; GFX8-NEXT: v_mov_b32_e32 v14, s41 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x90 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x80 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_mov_b32_e32 v11, s42 ; GFX8-NEXT: v_mov_b32_e32 v12, s43 ; GFX8-NEXT: v_mov_b32_e32 v13, s44 ; GFX8-NEXT: v_mov_b32_e32 v14, s45 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x80 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x70 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s8 ; GFX8-NEXT: v_mov_b32_e32 v11, s46 ; GFX8-NEXT: v_mov_b32_e32 v12, s47 ; GFX8-NEXT: v_mov_b32_e32 v13, s48 ; GFX8-NEXT: v_mov_b32_e32 v14, s49 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x70 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v11, v10, 0, 1 ; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x60 +; GFX8-NEXT: v_bfe_i32 v11, v10, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x60 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[9:12] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 ; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 0x50 +; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 
+; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 0x50 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v7, v6, 0, 1 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 64 +; GFX8-NEXT: v_bfe_i32 v7, v6, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 9, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 64 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[5:8] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 48 +; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, 6, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 48 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[3:6] -; GFX8-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 ; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 32 +; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s16 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s4 +; GFX8-NEXT: v_mov_b32_e32 v16, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 32 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s8 ; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[1:4] -; GFX8-NEXT: v_mov_b32_e32 v17, s15 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v16, s16 ; GFX8-NEXT: v_bfe_i32 v2, v13, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v16, s14 -; GFX8-NEXT: s_add_u32 s14, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s4 +; GFX8-NEXT: v_mov_b32_e32 v17, s17 +; GFX8-NEXT: s_add_u32 s16, s0, 16 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: s_addc_u32 s15, s1, 0 +; GFX8-NEXT: s_addc_u32 s17, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v18, s15 +; GFX8-NEXT: v_mov_b32_e32 v18, s17 ; GFX8-NEXT: v_bfe_i32 v2, v11, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v14, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; 
GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v17, s14 +; GFX8-NEXT: v_mov_b32_e32 v17, s16 ; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v18, s1 ; GFX8-NEXT: v_bfe_i32 v2, v12, 0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s5 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 14, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 15, s9 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 ; GFX8-NEXT: v_mov_b32_e32 v17, s0 ; GFX8-NEXT: s_add_u32 s14, s0, 0x170 ; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[0:3] @@ -7420,134 +7425,137 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_bfe_i32 v2, v10, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v9, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s9 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_mov_b32_e32 v10, s15 -; GFX8-NEXT: s_add_u32 s4, s0, 0x160 -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 12, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v8, 13, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v16, 7, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 5, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s5 +; GFX8-NEXT: s_add_u32 s8, s0, 0x160 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 10, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v6, 11, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v16, 7, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 5, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s9 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s5 -; GFX8-NEXT: v_lshrrev_b16_e64 v10, 1, s5 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v10, 1, s9 ; GFX8-NEXT: v_bfe_i32 v2, v8, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v8, s5 -; GFX8-NEXT: v_mov_b32_e32 v7, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 0x150 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x150 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NEXT: v_lshrrev_b16_e64 v7, 6, s10 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v6, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v5, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v6, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_add_u32 s4, s0, 0x140 +; GFX8-NEXT: v_mov_b32_e32 v6, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x140 ; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v15, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v4, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 
v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x130 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x130 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v16, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v13, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x120 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x120 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v12, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v11, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x110 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v9, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v14, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x100 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x100 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v10, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x1f0 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_lshrrev_b16_e64 v7, 6, s10 ; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s10 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: s_add_u32 s8, s0, 0x1f0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_lshrrev_b16_e64 v17, 4, s10 +; GFX8-NEXT: s_addc_u32 s9, s1, 0 ; GFX8-NEXT: v_bfe_i32 v2, v8, 0, 1 ; GFX8-NEXT: v_bfe_i32 v0, v7, 0, 1 -; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s8 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: v_lshrrev_b16_e64 v17, 4, s10 ; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s10 +; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v15, 4, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s6 +; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s6 ; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GFX8-NEXT: s_add_u32 s4, s0, 0x1e0 +; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s6 +; GFX8-NEXT: s_add_u32 s6, s0, 0x1e0 ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 1 -; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1 ; GFX8-NEXT: v_bfe_i32 v16, v17, 0, 1 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s10 ; GFX8-NEXT: 
v_lshrrev_b16_e64 v21, 3, s10 ; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s10 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_add_u32 s4, s0, 0x1d0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: s_add_u32 s6, s0, 0x1d0 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1 ; GFX8-NEXT: v_bfe_i32 v18, v22, 0, 1 ; GFX8-NEXT: v_bfe_i32 v22, v21, 0, 1 -; GFX8-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_addc_u32 s7, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v23, 31, v22 ; GFX8-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v16, s4 ; GFX8-NEXT: s_add_u32 s4, s0, 0x1c0 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; GFX8-NEXT: v_mov_b32_e32 v17, s5 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GFX8-NEXT: v_mov_b32_e32 v16, s8 -; GFX8-NEXT: v_mov_b32_e32 v17, s9 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xf0 -; GFX8-NEXT: v_lshrrev_b16_e64 v15, 4, s6 ; GFX8-NEXT: v_bfe_i32 v14, v12, 0, 1 ; GFX8-NEXT: v_bfe_i32 v12, v13, 0, 1 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19] @@ -7558,15 +7566,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xe0 -; GFX8-NEXT: v_lshrrev_b16_e64 v9, 5, s6 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GFX8-NEXT: s_addc_u32 s5, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_add_u32 s4, s0, 0xd0 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 2, s6 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 3, s6 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GFX8-NEXT: s_addc_u32 s5, s1, 0 @@ -7606,30 +7611,30 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T43.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T76.XYZW, T42.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T55.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T75.XYZW, T40.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T39.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T74.XYZW, T38.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T66.XYZW, T36.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T35.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T72.XYZW, T34.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T33.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T71.XYZW, T32.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T31.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T70.XYZW, T30.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T29.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T69.XYZW, T28.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T27.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T75.XYZW, T39.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T38.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T74.XYZW, T37.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T36.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T66.XYZW, T35.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T58.XYZW, T34.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T72.XYZW, T33.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T32.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T71.XYZW, T31.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T30.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T70.XYZW, T29.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T28.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T69.XYZW, T27.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T26.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T68.XYZW, T25.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T24.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T23.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T23.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T22.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T67.XYZW, T21.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T20.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T19.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 38: -; EG-NEXT: VTX_READ_64 T26.XY, T26.X, 0, #1 +; EG-NEXT: VTX_READ_64 T40.XY, T26.X, 0, #1 ; EG-NEXT: ALU clause starting at 40: ; EG-NEXT: LSHR T19.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, @@ -7655,238 +7660,238 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 63: -; EG-NEXT: LSHR T27.X, T0.W, literal.x, +; EG-NEXT: LSHR T26.X, T0.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) -; EG-NEXT: LSHR T28.X, PV.W, literal.x, +; EG-NEXT: LSHR T27.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43) -; EG-NEXT: LSHR T29.X, PV.W, literal.x, +; EG-NEXT: LSHR T28.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43) -; EG-NEXT: LSHR T30.X, PV.W, literal.x, +; EG-NEXT: LSHR T29.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43) -; EG-NEXT: LSHR T31.X, PV.W, literal.x, +; EG-NEXT: LSHR T30.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43) -; EG-NEXT: LSHR T32.X, PV.W, literal.x, +; EG-NEXT: LSHR T31.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43) -; EG-NEXT: LSHR T33.X, PV.W, literal.x, +; EG-NEXT: LSHR T32.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43) -; EG-NEXT: LSHR T34.X, PV.W, literal.x, +; EG-NEXT: LSHR T33.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43) -; EG-NEXT: LSHR T35.X, PV.W, literal.x, +; EG-NEXT: LSHR T34.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 256(3.587324e-43) -; EG-NEXT: LSHR T36.X, PV.W, literal.x, +; EG-NEXT: LSHR T35.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 272(3.811532e-43) -; EG-NEXT: LSHR T37.X, PV.W, literal.x, +; EG-NEXT: LSHR T36.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 288(4.035740e-43) -; EG-NEXT: LSHR T38.X, PV.W, literal.x, +; EG-NEXT: LSHR T37.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; 
EG-NEXT: 2(2.802597e-45), 304(4.259947e-43) -; EG-NEXT: LSHR T39.X, PV.W, literal.x, +; EG-NEXT: LSHR T38.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 320(4.484155e-43) -; EG-NEXT: LSHR T40.X, PV.W, literal.x, +; EG-NEXT: LSHR T39.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 336(4.708363e-43) ; EG-NEXT: LSHR T41.X, PV.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 352(4.932571e-43) ; EG-NEXT: LSHR T42.X, PV.W, literal.x, -; EG-NEXT: LSHR T0.Z, T26.Y, literal.y, -; EG-NEXT: LSHR T0.W, T26.Y, literal.z, +; EG-NEXT: LSHR T0.Z, T40.Y, literal.y, +; EG-NEXT: LSHR T0.W, T40.Y, literal.z, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 28(3.923636e-44) ; EG-NEXT: 29(4.063766e-44), 368(5.156778e-43) ; EG-NEXT: LSHR T43.X, PS, literal.x, -; EG-NEXT: LSHR T0.Y, T26.Y, literal.y, -; EG-NEXT: LSHR T1.Z, T26.Y, literal.z, -; EG-NEXT: LSHR * T1.W, T26.Y, literal.w, +; EG-NEXT: LSHR T0.Y, T40.Y, literal.y, +; EG-NEXT: LSHR T1.Z, T40.Y, literal.z, +; EG-NEXT: LSHR * T1.W, T40.Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) ; EG-NEXT: 25(3.503246e-44), 20(2.802597e-44) ; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, ; EG-NEXT: 384(5.380986e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T44.X, PV.W, literal.x, -; EG-NEXT: LSHR T1.Y, T26.Y, literal.y, -; EG-NEXT: LSHR T2.Z, T26.Y, literal.z, -; EG-NEXT: LSHR * T2.W, T26.Y, literal.w, +; EG-NEXT: LSHR T1.Y, T40.Y, literal.y, +; EG-NEXT: LSHR T2.Z, T40.Y, literal.z, +; EG-NEXT: LSHR * T2.W, T40.Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 21(2.942727e-44) ; EG-NEXT: 16(2.242078e-44), 17(2.382207e-44) ; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x, ; EG-NEXT: 400(5.605194e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T45.X, PV.W, literal.x, -; EG-NEXT: LSHR T2.Y, T26.Y, literal.y, -; EG-NEXT: LSHR T3.Z, T26.Y, literal.z, -; EG-NEXT: LSHR * T3.W, T26.Y, literal.w, +; EG-NEXT: LSHR T2.Y, T40.Y, literal.y, +; EG-NEXT: LSHR T3.Z, T40.Y, literal.z, +; EG-NEXT: LSHR * T3.W, T40.Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 12(1.681558e-44) ; EG-NEXT: 13(1.821688e-44), 8(1.121039e-44) ; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.x, ; EG-NEXT: 416(5.829402e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T46.X, PV.W, literal.x, -; EG-NEXT: LSHR T3.Y, T26.Y, literal.y, -; EG-NEXT: LSHR T4.Z, T26.Y, literal.z, -; EG-NEXT: LSHR * T4.W, T26.Y, literal.w, +; EG-NEXT: LSHR T3.Y, T40.Y, literal.y, +; EG-NEXT: LSHR T4.Z, T40.Y, literal.z, +; EG-NEXT: LSHR * T4.W, T40.Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) ; EG-NEXT: 4(5.605194e-45), 5(7.006492e-45) ; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.x, ; EG-NEXT: 432(6.053609e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T47.X, PV.W, literal.x, ; EG-NEXT: ADD_INT T4.Y, KC0[2].Y, literal.y, -; EG-NEXT: LSHR T5.Z, T26.Y, 1, -; EG-NEXT: LSHR T5.W, T26.X, literal.z, +; EG-NEXT: LSHR T5.Z, T40.Y, 1, +; EG-NEXT: LSHR T5.W, T40.X, literal.z, ; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.w, ; EG-NEXT: 2(2.802597e-45), 464(6.502025e-43) ; EG-NEXT: 28(3.923636e-44), 448(6.277817e-43) ; EG-NEXT: ALU clause starting at 153: ; EG-NEXT: LSHR T48.X, T6.W, literal.x, -; EG-NEXT: LSHR T5.Y, T26.X, literal.y, -; EG-NEXT: LSHR T6.Z, T26.X, literal.z, -; EG-NEXT: LSHR * T6.W, T26.X, literal.w, +; EG-NEXT: LSHR T5.Y, T40.X, literal.y, +; EG-NEXT: LSHR T6.Z, T40.X, literal.z, +; EG-NEXT: LSHR * T6.W, T40.X, literal.w, ; EG-NEXT: 2(2.802597e-45), 29(4.063766e-44) ; EG-NEXT: 24(3.363116e-44), 
25(3.503246e-44) -; EG-NEXT: LSHR * T7.W, T26.X, literal.x, +; EG-NEXT: LSHR * T7.W, T40.X, literal.x, ; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T49.X, T26.X, 0.0, 1, -; EG-NEXT: LSHR T6.Y, T26.X, literal.x, -; EG-NEXT: ASHR T50.Z, T26.Y, literal.y, -; EG-NEXT: LSHR T8.W, T26.Y, literal.z, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.w, +; EG-NEXT: BFE_INT T49.X, T40.X, 0.0, 1, +; EG-NEXT: LSHR T6.Y, T40.X, literal.x, +; EG-NEXT: ASHR T50.Z, T40.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.z, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.w, ; EG-NEXT: 21(2.942727e-44), 31(4.344025e-44) ; EG-NEXT: 27(3.783506e-44), 30(4.203895e-44) ; EG-NEXT: BFE_INT T50.X, PS, 0.0, 1, -; EG-NEXT: LSHR T7.Y, T26.X, literal.x, +; EG-NEXT: LSHR T7.Y, T40.X, literal.x, ; EG-NEXT: BFE_INT T51.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.y, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.z, +; EG-NEXT: LSHR T8.W, T40.Y, literal.y, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.z, ; EG-NEXT: 16(2.242078e-44), 23(3.222986e-44) ; EG-NEXT: 26(3.643376e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T51.X, PS, 0.0, 1, ; EG-NEXT: MOV T50.Y, PV.X, ; EG-NEXT: BFE_INT T52.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 19(2.662467e-44), 22(3.082857e-44) ; EG-NEXT: BFE_INT T52.X, PS, 0.0, 1, ; EG-NEXT: MOV T51.Y, PV.X, ; EG-NEXT: BFE_INT T53.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 15(2.101948e-44), 18(2.522337e-44) ; EG-NEXT: BFE_INT T53.X, PS, 0.0, 1, ; EG-NEXT: MOV T52.Y, PV.X, ; EG-NEXT: BFE_INT T54.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 11(1.541428e-44), 14(1.961818e-44) ; EG-NEXT: BFE_INT T54.X, PS, 0.0, 1, ; EG-NEXT: MOV T53.Y, PV.X, ; EG-NEXT: BFE_INT T55.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 7(9.809089e-45), 10(1.401298e-44) ; EG-NEXT: BFE_INT T55.X, PS, 0.0, 1, ; EG-NEXT: MOV T54.Y, PV.X, ; EG-NEXT: BFE_INT T56.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.Y, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.Y, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 3(4.203895e-45), 6(8.407791e-45) ; EG-NEXT: BFE_INT T56.X, PS, 0.0, 1, ; EG-NEXT: MOV T55.Y, PV.X, ; EG-NEXT: BFE_INT T57.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T8.W, T26.X, literal.x, -; EG-NEXT: LSHR * T9.W, T26.Y, literal.y, +; EG-NEXT: LSHR T8.W, T40.X, literal.x, +; EG-NEXT: LSHR * T9.W, T40.Y, literal.y, ; EG-NEXT: 17(2.382207e-44), 2(2.802597e-45) ; EG-NEXT: BFE_INT T57.X, PS, 0.0, 1, ; EG-NEXT: MOV T56.Y, PV.X, -; EG-NEXT: ASHR T58.Z, T26.X, literal.x, -; EG-NEXT: LSHR T9.W, T26.X, literal.y, -; EG-NEXT: LSHR * T10.W, T26.X, literal.z, +; EG-NEXT: ASHR T58.Z, T40.X, literal.x, +; EG-NEXT: LSHR T9.W, T40.X, literal.y, +; EG-NEXT: LSHR * T10.W, T40.X, literal.z, ; EG-NEXT: 31(4.344025e-44), 27(3.783506e-44) ; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) ; EG-NEXT: BFE_INT T58.X, PS, 0.0, 1, ; EG-NEXT: MOV T57.Y, PV.X, ; EG-NEXT: BFE_INT T59.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; 
EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 23(3.222986e-44), 26(3.643376e-44) ; EG-NEXT: BFE_INT T59.X, PS, 0.0, 1, ; EG-NEXT: MOV T58.Y, PV.X, ; EG-NEXT: BFE_INT T60.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 19(2.662467e-44), 22(3.082857e-44) ; EG-NEXT: BFE_INT T60.X, PS, 0.0, 1, ; EG-NEXT: MOV T59.Y, PV.X, ; EG-NEXT: BFE_INT T61.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 15(2.101948e-44), 18(2.522337e-44) ; EG-NEXT: BFE_INT T61.X, PS, 0.0, 1, ; EG-NEXT: MOV T60.Y, PV.X, ; EG-NEXT: BFE_INT T62.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 11(1.541428e-44), 14(1.961818e-44) ; EG-NEXT: BFE_INT T62.X, PS, 0.0, 1, ; EG-NEXT: MOV T61.Y, PV.X, ; EG-NEXT: BFE_INT T63.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 7(9.809089e-45), 10(1.401298e-44) ; EG-NEXT: BFE_INT T63.X, PS, 0.0, 1, ; EG-NEXT: MOV T62.Y, PV.X, ; EG-NEXT: BFE_INT T64.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR * T9.W, T26.X, literal.x, +; EG-NEXT: LSHR * T9.W, T40.X, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 253: -; EG-NEXT: LSHR * T10.W, T26.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.x, ; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) ; EG-NEXT: BFE_INT T64.X, PV.W, 0.0, 1, ; EG-NEXT: MOV T63.Y, T63.X, ; EG-NEXT: BFE_INT T65.Z, T9.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, 1, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T10.W, T26.X, literal.x, +; EG-NEXT: LSHR T9.W, T40.X, 1, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR * T10.W, T40.X, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: BFE_INT T65.X, PS, 0.0, 1, ; EG-NEXT: MOV T64.Y, PV.X, ; EG-NEXT: BFE_INT T49.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T9.W, T26.X, literal.x, -; EG-NEXT: LSHR * T10.W, T26.X, literal.y, +; EG-NEXT: LSHR T9.W, T40.X, literal.x, +; EG-NEXT: LSHR * T10.W, T40.X, literal.y, ; EG-NEXT: 12(1.681558e-44), 5(7.006492e-45) -; EG-NEXT: BFE_INT T66.X, T26.Y, 0.0, 1, +; EG-NEXT: BFE_INT T66.X, T40.Y, 0.0, 1, ; EG-NEXT: MOV T65.Y, PV.X, ; EG-NEXT: BFE_INT T67.Z, PS, 0.0, 1, -; EG-NEXT: LSHR T10.W, T26.X, literal.x, -; EG-NEXT: LSHR * T11.W, T26.X, literal.y, +; EG-NEXT: LSHR T10.W, T40.X, literal.x, +; EG-NEXT: LSHR * T11.W, T40.X, literal.y, ; EG-NEXT: 9(1.261169e-44), 4(5.605194e-45) ; EG-NEXT: BFE_INT T67.X, PS, 0.0, 1, ; EG-NEXT: MOV T49.Y, T49.X, -; EG-NEXT: BFE_INT T26.Z, PV.W, 0.0, 1, -; EG-NEXT: LSHR T10.W, T26.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR * T11.W, T26.X, literal.y, +; EG-NEXT: BFE_INT T40.Z, PV.W, 0.0, 1, +; EG-NEXT: LSHR T10.W, T40.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: LSHR * T11.W, T40.X, literal.y, ; EG-NEXT: 13(1.821688e-44), 8(1.121039e-44) -; EG-NEXT: BFE_INT T26.X, PS, 0.0, 1, +; EG-NEXT: BFE_INT T40.X, PS, 0.0, 1, ; EG-NEXT: MOV T67.Y, PV.X, ; EG-NEXT: BFE_INT T68.Z, PV.W, 0.0, 1, ; EG-NEXT: MOV T49.W, T49.Z, ; EG-NEXT: MOV * T65.W, T65.Z, ; EG-NEXT: BFE_INT T68.X, T9.W, 0.0, 1, -; 
EG-NEXT: MOV T26.Y, PV.X, +; EG-NEXT: MOV T40.Y, PV.X, ; EG-NEXT: BFE_INT T69.Z, T8.W, 0.0, 1, BS:VEC_120/SCL_212 ; EG-NEXT: MOV T67.W, T67.Z, ; EG-NEXT: MOV * T64.W, T64.Z, ; EG-NEXT: BFE_INT T69.X, T7.Y, 0.0, 1, ; EG-NEXT: MOV T68.Y, PV.X, ; EG-NEXT: BFE_INT T70.Z, T6.Y, 0.0, 1, BS:VEC_120/SCL_212 -; EG-NEXT: MOV T26.W, T26.Z, +; EG-NEXT: MOV T40.W, T40.Z, ; EG-NEXT: MOV * T63.W, T63.Z, ; EG-NEXT: BFE_INT T70.X, T7.W, 0.0, 1, ; EG-NEXT: MOV T69.Y, PV.X, diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 7d98459704918..bee3d455187ca 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2976,22 +2976,22 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s0, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s3, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s39, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s43, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s51, s1, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s0, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s3, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s51, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s53, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s39, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s40, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s43, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s44, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff @@ -3086,50 +3086,50 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s54 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -3152,10 +3152,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s39, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s40, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s43, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s41, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s43, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s45, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s47, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff ; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff @@ -3163,10 +3163,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_and_b32 s34, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s38, s7, 0xffff -; GCN-HSA-NEXT: s_and_b32 s44, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s45, s9, 0xffff -; GCN-HSA-NEXT: s_and_b32 s46, s8, 0xffff -; GCN-HSA-NEXT: s_and_b32 s47, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s40, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s42, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s44, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s46, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s48, s10, 0xffff ; GCN-HSA-NEXT: 
s_and_b32 s49, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s50, s12, 0xffff @@ -3284,18 +3284,18 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s45 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -3303,23 +3303,23 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 @@ -3585,42 +3585,42 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T56.XYZW, T58.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T52.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T53.XYZW, T55.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T39.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T48.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T40.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T46.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T41.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T43.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T36.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T38.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1 -; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 48, #1 -; EG-NEXT: VTX_READ_128 T40.XYZW, T35.X, 32, #1 -; EG-NEXT: VTX_READ_128 T41.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T38.XYZW, T37.X, 0, #1 +; EG-NEXT: VTX_READ_128 T39.XYZW, T37.X, 48, #1 +; EG-NEXT: VTX_READ_128 T40.XYZW, T37.X, 32, #1 +; EG-NEXT: VTX_READ_128 T41.XYZW, T37.X, 16, #1 ; EG-NEXT: Fetch clause starting at 30: 
-; EG-NEXT: VTX_READ_128 T49.XYZW, T35.X, 112, #1 -; EG-NEXT: VTX_READ_128 T50.XYZW, T35.X, 96, #1 -; EG-NEXT: VTX_READ_128 T51.XYZW, T35.X, 80, #1 -; EG-NEXT: VTX_READ_128 T52.XYZW, T35.X, 64, #1 +; EG-NEXT: VTX_READ_128 T49.XYZW, T37.X, 112, #1 +; EG-NEXT: VTX_READ_128 T50.XYZW, T37.X, 96, #1 +; EG-NEXT: VTX_READ_128 T51.XYZW, T37.X, 80, #1 +; EG-NEXT: VTX_READ_128 T52.XYZW, T37.X, 64, #1 ; EG-NEXT: ALU clause starting at 38: -; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: MOV * T37.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 39: -; EG-NEXT: LSHR * T37.W, T36.Y, literal.x, +; EG-NEXT: LSHR * T35.W, T38.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T37.Z, T36.Y, literal.x, +; EG-NEXT: AND_INT * T35.Z, T38.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR T37.Y, T36.X, literal.x, -; EG-NEXT: LSHR * T38.W, T36.W, literal.x, +; EG-NEXT: LSHR T35.Y, T38.X, literal.x, +; EG-NEXT: LSHR * T36.W, T38.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T37.X, T36.X, literal.x, -; EG-NEXT: AND_INT T38.Z, T36.W, literal.x, -; EG-NEXT: LSHR * T36.X, KC0[2].Y, literal.y, +; EG-NEXT: AND_INT T35.X, T38.X, literal.x, +; EG-NEXT: AND_INT T36.Z, T38.W, literal.x, +; EG-NEXT: LSHR * T38.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) -; EG-NEXT: LSHR T38.Y, T36.Z, literal.x, +; EG-NEXT: LSHR T36.Y, T38.Z, literal.x, ; EG-NEXT: LSHR * T42.W, T41.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T38.X, T36.Z, literal.x, +; EG-NEXT: AND_INT T36.X, T38.Z, literal.x, ; EG-NEXT: AND_INT T42.Z, T41.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) @@ -3657,16 +3657,16 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: LSHR * T35.W, T39.Y, literal.y, +; EG-NEXT: LSHR * T37.W, T39.Y, literal.y, ; EG-NEXT: 80(1.121039e-43), 16(2.242078e-44) ; EG-NEXT: LSHR T48.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T35.Z, T39.Y, literal.y, +; EG-NEXT: AND_INT * T37.Z, T39.Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: ALU clause starting at 95: -; EG-NEXT: LSHR T35.Y, T39.X, literal.x, +; EG-NEXT: LSHR T37.Y, T39.X, literal.x, ; EG-NEXT: LSHR * T53.W, T39.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T35.X, T39.X, literal.x, +; EG-NEXT: AND_INT T37.X, T39.X, literal.x, ; EG-NEXT: AND_INT T53.Z, T39.W, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43) @@ -3762,167 +3762,167 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; GCN-NOHSA-SI: ; %bb.0: ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x0 -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s0, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s35, s1 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s38, s0 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s3, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s2, 16 -; 
GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s41, s3 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s42, s2 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s4, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s6, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s9, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s48, s8, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s9, s9 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s11, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s50, s10, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s51, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s12, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s53, s15, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s54, s14, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s17, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s16, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s17, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s16, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s17, s17 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s16, s16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s19, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s18, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s19, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s18, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s19, s19 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s18, s18 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s21, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s20, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s21, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s20, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s21, s21 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s20, s20 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s23, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s22, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s42, s22, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s23, s23 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s22, s22 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s24, 16 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s64, s25 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s25, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s24, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s25, s25 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s24, s24 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s27, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s26, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s27, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s26, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s27, s27 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s29, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s28, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s26 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s29, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s48, s28, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s29, s29 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s28, s28 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s69, s31, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s30, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s49, s31, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s50, s30, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s31, s31 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s30, s30 -; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s26, s26 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s25, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s51, s1, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s52, s0, 16 +; 
GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s53, s1 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s54, s0 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s3, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s2, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s57, s3 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s58, s2 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s4, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s62, s6, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s7, s7 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s63, s8, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s64, s9 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s65, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s66, s10, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s11, s11 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s67, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s68, s12, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s13, s13 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s69, s15, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s70, s14, 16 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s15, s15 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s14, s14 +; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s10, s10 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s9, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s37 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s63 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s59 ; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s57 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s53 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s43 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -5640,61 +5640,61 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: @@ -6920,133 +6920,133 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 
s42, s6, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s4, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s0, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[68:69], s[22:23], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[60:61], s[0:1], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[8:9], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[12:13], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[14:15], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[6:7], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s65 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s63 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s69 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s67 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 
0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[52:53], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s74 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s75 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s72 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s73 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 @@ -7066,21 +7066,21 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s42, s15 -; GCN-HSA-NEXT: s_mov_b32 s44, s13 -; GCN-HSA-NEXT: s_mov_b32 s46, s11 -; GCN-HSA-NEXT: s_mov_b32 s48, s9 -; GCN-HSA-NEXT: s_mov_b32 s50, s7 -; GCN-HSA-NEXT: s_mov_b32 s52, s5 -; GCN-HSA-NEXT: s_mov_b32 s54, s3 -; GCN-HSA-NEXT: s_mov_b32 s56, s1 -; GCN-HSA-NEXT: s_lshr_b32 s58, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s60, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s70, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s72, s0, 16 +; GCN-HSA-NEXT: s_mov_b32 s48, s13 +; GCN-HSA-NEXT: s_mov_b32 s50, s11 +; GCN-HSA-NEXT: s_mov_b32 s52, s9 +; GCN-HSA-NEXT: s_mov_b32 s54, s7 +; GCN-HSA-NEXT: s_mov_b32 s56, s5 +; GCN-HSA-NEXT: s_mov_b32 s46, s3 +; GCN-HSA-NEXT: s_mov_b32 s58, s1 +; GCN-HSA-NEXT: s_lshr_b32 s60, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s70, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s72, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s74, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], 
s[2:3], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 @@ -7094,7 +7094,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[4:5], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[44:45], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 @@ -7102,44 +7102,44 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 -; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[72:73], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[74:75], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[72:73], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[70:71], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[68:69], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s56, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s57, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45 +; GCN-HSA-NEXT: s_add_u32 s58, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s59, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 +; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s49 +; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 +; GCN-HSA-NEXT: s_add_u32 s48, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 +; GCN-HSA-NEXT: s_add_u32 s48, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s49, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s44 ; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 ; 
GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 ; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 ; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s59 ; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 @@ -7148,34 +7148,34 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 ; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s44 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 ; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index d2ea7a95473e0..8791384101218 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -746,29 +746,29 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry -; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 -; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16 -; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11 +; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x8 +; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 +; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s2 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s9 -; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 @@ -779,29 +779,29 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; ; GFX8-NOHSA-LABEL: constant_load_v11i32: ; GFX8-NOHSA: ; %bb.0: ; %entry -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20 -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 -; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 16 -; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 @@ -2011,37 +2011,39 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; ; GFX8-NOHSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX8-NOHSA: ; %bb.0: -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; 
GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s8, 48 +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: s_nop 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 7c3a5db46a8d8..66fc322e5e04b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -2808,33 +2808,33 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s2, 24 ; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s2, 0x80008 ; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s3, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s4, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s5, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s31, s6, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s33, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s34, s7, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s35, s8, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s36, s8, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s37, s9, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s38, s9, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s39, s10, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s40, s10, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s41, s11, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s42, s11, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s43, s12, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s44, s12, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s45, s13, 24 -; GFX6-NOHSA-NEXT: 
s_bfe_u32 s46, s13, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s47, s14, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s48, s14, 0x80008 -; GFX6-NOHSA-NEXT: s_lshr_b32 s49, s15, 24 -; GFX6-NOHSA-NEXT: s_bfe_u32 s50, s15, 0x80008 -; GFX6-NOHSA-NEXT: s_and_b32 s51, s0, 0xff -; GFX6-NOHSA-NEXT: s_bfe_u32 s52, s0, 0x80010 +; GFX6-NOHSA-NEXT: s_bfe_u32 s26, s3, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s4, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s28, s4, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s30, s5, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s31, s6, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s33, s6, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s35, s7, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s8, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s37, s8, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s9, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s39, s9, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s10, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s41, s10, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s11, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s43, s11, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s12, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s45, s12, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s13, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s47, s13, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s14, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s49, s14, 0x80008 +; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s15, 24 +; GFX6-NOHSA-NEXT: s_bfe_u32 s51, s15, 0x80008 +; GFX6-NOHSA-NEXT: s_and_b32 s52, s0, 0xff +; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s0, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s53, s1, 0xff ; GFX6-NOHSA-NEXT: s_bfe_u32 s54, s1, 0x80010 ; GFX6-NOHSA-NEXT: s_and_b32 s55, s2, 0xff @@ -2870,76 +2870,76 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s70 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s51 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s50 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s69 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s49 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s68 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s47 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s46 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s43 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s44 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(3) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s43 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s42 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt 
expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s41 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s40 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s38 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s36 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s35 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s34 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s33 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s31 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s30 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s28 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s27 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s24 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 @@ -2956,9 +2956,9 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s51 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm @@ -2981,42 +2981,42 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 
s29, s4, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s31, s5, 24 ; GFX7-HSA-NEXT: s_bfe_u32 s33, s5, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s34, s6, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s35, s6, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s36, s7, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s37, s7, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s38, s8, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s39, s8, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s9, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s41, s9, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s10, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s43, s10, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s45, s11, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s46, s12, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s47, s12, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s13, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s49, s13, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s14, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s51, s14, 0x80008 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s15, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s53, s15, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s35, s6, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s36, s6, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s38, s7, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s39, s7, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s41, s8, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s42, s8, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s43, s9, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s44, s9, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s45, s10, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s46, s10, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s47, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s48, s11, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s49, s12, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s50, s12, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s51, s13, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s52, s13, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s53, s14, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s54, s14, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s55, s15, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s56, s15, 0x80008 ; GFX7-HSA-NEXT: s_and_b32 s24, s0, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s27, s1, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s30, s2, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s2, s2, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s54, s3, 0xff +; GFX7-HSA-NEXT: s_and_b32 s34, s3, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s55, s4, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s56, s4, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s57, s5, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s58, s5, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s59, s6, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s6, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s60, s7, 0xff -; GFX7-HSA-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s37, s4, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s40, s5, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s57, s6, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s58, s6, 0x80010 +; GFX7-HSA-NEXT: s_and_b32 s59, s7, 0xff +; GFX7-HSA-NEXT: s_bfe_u32 s60, s7, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s61, s8, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s8, s8, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s62, s9, 0xff @@ -3033,118 +3033,119 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_u32 s14, s14, 0x80010 ; GFX7-HSA-NEXT: s_and_b32 s68, s15, 0xff ; GFX7-HSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v25, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x90 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s54 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s50 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s53 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s55 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s52 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49 ; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s48 ; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s47 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v34, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s42 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s38 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s46 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s44 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s16, 64 ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s37 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; 
GFX7-HSA-NEXT: s_add_u32 s4, s16, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 @@ -3214,8 +3215,8 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NOHSA-NEXT: s_and_b32 s28, s4, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s4 -; GFX8-NOHSA-NEXT: s_bfe_u32 s41, s4, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s4, s5, 0xff +; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX8-NOHSA-NEXT: s_and_b32 s41, s5, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s42, s5, 0x80010 ; GFX8-NOHSA-NEXT: s_and_b32 s43, s6, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s44, s6, 0x80010 @@ -3327,28 +3328,27 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s16, 0x50 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s43 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s44 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s29 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s7 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s27 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 64 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s4 ; GFX8-NOHSA-NEXT: s_add_u32 s4, s16, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s41 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s25 ; GFX8-NOHSA-NEXT: s_addc_u32 s5, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] @@ -3398,96 +3398,96 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T33.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T37.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T35.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T35.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T31.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T20.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T22.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T27.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T21.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch 
clause starting at 22: -; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 16, #1 -; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 0, #1 +; EG-NEXT: VTX_READ_128 T22.XYZW, T21.X, 16, #1 +; EG-NEXT: VTX_READ_128 T23.XYZW, T21.X, 0, #1 ; EG-NEXT: Fetch clause starting at 26: -; EG-NEXT: VTX_READ_128 T32.XYZW, T19.X, 48, #1 -; EG-NEXT: VTX_READ_128 T33.XYZW, T19.X, 32, #1 +; EG-NEXT: VTX_READ_128 T32.XYZW, T21.X, 48, #1 +; EG-NEXT: VTX_READ_128 T33.XYZW, T21.X, 32, #1 ; EG-NEXT: ALU clause starting at 30: -; EG-NEXT: MOV * T19.X, KC0[2].Z, +; EG-NEXT: MOV * T21.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 31: ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T22.Z, T21.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT * T19.Z, T23.X, literal.x, PV.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T22.Y, T21.X, literal.x, T0.W, -; EG-NEXT: BFE_UINT T23.Z, T21.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T22.W, T21.X, literal.z, +; EG-NEXT: BFE_UINT T19.Y, T23.X, literal.x, T0.W, +; EG-NEXT: BFE_UINT T20.Z, T23.Y, literal.y, T0.W, +; EG-NEXT: LSHR * T19.W, T23.X, literal.z, ; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T22.X, T21.X, literal.x, -; EG-NEXT: BFE_UINT T23.Y, T21.Y, literal.y, T0.W, -; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.z, +; EG-NEXT: AND_INT T19.X, T23.X, literal.x, +; EG-NEXT: BFE_UINT T20.Y, T23.Y, literal.y, T0.W, +; EG-NEXT: LSHR * T23.X, KC0[2].Y, literal.z, ; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T24.Z, T21.Z, literal.x, T0.W, -; EG-NEXT: LSHR * T23.W, T21.Y, literal.y, +; EG-NEXT: BFE_UINT T24.Z, T23.Z, literal.x, T0.W, +; EG-NEXT: LSHR * T20.W, T23.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T23.X, T21.Y, literal.x, -; EG-NEXT: BFE_UINT T24.Y, T21.Z, literal.y, T0.W, +; EG-NEXT: AND_INT T20.X, T23.Y, literal.x, +; EG-NEXT: BFE_UINT T24.Y, T23.Z, literal.y, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: LSHR T25.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T26.Z, T21.W, literal.y, T0.W, -; EG-NEXT: LSHR T24.W, T21.Z, literal.z, -; EG-NEXT: AND_INT * T24.X, T21.Z, literal.w, +; EG-NEXT: BFE_UINT T26.Z, T23.W, literal.y, T0.W, +; EG-NEXT: LSHR T24.W, T23.Z, literal.z, +; EG-NEXT: AND_INT * T24.X, T23.Z, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T26.Y, T21.W, literal.x, T0.W, +; EG-NEXT: BFE_UINT T26.Y, T23.W, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) ; EG-NEXT: LSHR T27.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T28.Z, T20.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T26.W, T21.W, literal.z, -; EG-NEXT: AND_INT * T26.X, T21.W, literal.w, +; EG-NEXT: BFE_UINT T28.Z, T22.X, literal.y, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHR T26.W, T23.W, literal.z, +; EG-NEXT: AND_INT * T26.X, T23.W, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T28.Y, T20.X, literal.x, T0.W, +; EG-NEXT: BFE_UINT T28.Y, T22.X, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) ; EG-NEXT: LSHR T29.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT T30.Z, T20.Y, literal.y, T0.W, -; EG-NEXT: LSHR T28.W, T20.X, 
literal.z, -; EG-NEXT: AND_INT * T28.X, T20.X, literal.w, +; EG-NEXT: BFE_UINT T30.Z, T22.Y, literal.y, T0.W, +; EG-NEXT: LSHR T28.W, T22.X, literal.z, +; EG-NEXT: AND_INT * T28.X, T22.X, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) -; EG-NEXT: BFE_UINT T30.Y, T20.Y, literal.x, T0.W, +; EG-NEXT: BFE_UINT T30.Y, T22.Y, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44) -; EG-NEXT: LSHR T20.X, PV.W, literal.x, -; EG-NEXT: LSHR T30.W, T20.Y, literal.y, -; EG-NEXT: AND_INT * T30.X, T20.Y, literal.z, +; EG-NEXT: LSHR T22.X, PV.W, literal.x, +; EG-NEXT: LSHR T30.W, T22.Y, literal.y, +; EG-NEXT: AND_INT * T30.X, T22.Y, literal.z, ; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44) ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T19.Z, T20.Z, literal.x, T0.W, +; EG-NEXT: BFE_UINT T21.Z, T22.Z, literal.x, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43) ; EG-NEXT: LSHR T31.X, PV.W, literal.x, -; EG-NEXT: BFE_UINT * T19.Y, T20.Z, literal.y, T0.W, +; EG-NEXT: BFE_UINT * T21.Y, T22.Z, literal.y, T0.W, ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; EG-NEXT: ALU clause starting at 91: -; EG-NEXT: BFE_UINT T34.Z, T20.W, literal.x, T0.W, -; EG-NEXT: LSHR * T19.W, T20.Z, literal.y, +; EG-NEXT: BFE_UINT T34.Z, T22.W, literal.x, T0.W, +; EG-NEXT: LSHR * T21.W, T22.Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) -; EG-NEXT: AND_INT T19.X, T20.Z, literal.x, -; EG-NEXT: BFE_UINT T34.Y, T20.W, literal.y, T0.W, +; EG-NEXT: AND_INT T21.X, T22.Z, literal.x, +; EG-NEXT: BFE_UINT T34.Y, T22.W, literal.y, T0.W, ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) ; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) ; EG-NEXT: LSHR T35.X, PV.W, literal.x, ; EG-NEXT: BFE_UINT T36.Z, T33.X, literal.y, T0.W, BS:VEC_021/SCL_122 -; EG-NEXT: LSHR T34.W, T20.W, literal.z, -; EG-NEXT: AND_INT * T34.X, T20.W, literal.w, +; EG-NEXT: LSHR T34.W, T22.W, literal.z, +; EG-NEXT: AND_INT * T34.X, T22.W, literal.w, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43) ; EG-NEXT: BFE_UINT T36.Y, T33.X, literal.x, T0.W, @@ -3767,15 +3767,15 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 s37, s6, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s38, s6, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s39, s6, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s43, s7, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s45, s8, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s46, s8, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s47, s8, 0x80008 -; GFX7-HSA-NEXT: s_ashr_i32 s48, s9, 24 -; GFX7-HSA-NEXT: s_bfe_i32 s49, s9, 0x80010 -; GFX7-HSA-NEXT: s_bfe_i32 s50, s9, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s40, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s41, s7, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s42, s7, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s43, s8, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s44, s8, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s45, s8, 0x80008 +; GFX7-HSA-NEXT: s_ashr_i32 s47, s9, 24 +; GFX7-HSA-NEXT: s_bfe_i32 s48, s9, 0x80010 +; GFX7-HSA-NEXT: s_bfe_i32 s49, s9, 0x80008 ; GFX7-HSA-NEXT: s_ashr_i32 s51, s10, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s52, s10, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s53, s10, 0x80008 @@ -3794,51 +3794,49 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_ashr_i32 
s66, s15, 24 ; GFX7-HSA-NEXT: s_bfe_i32 s67, s15, 0x80010 ; GFX7-HSA-NEXT: s_bfe_i32 s68, s15, 0x80008 -; GFX7-HSA-NEXT: s_sext_i32_i8 s40, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xf0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s44, s7 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xe0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xb0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s7 +; GFX7-HSA-NEXT: s_sext_i32_i8 s46, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xf0 +; GFX7-HSA-NEXT: s_sext_i32_i8 s50, s9 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xe0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xb0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0xa0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s9 ; GFX7-HSA-NEXT: s_sext_i32_i8 s14, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x90 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s65 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s64 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s63 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x80 -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: s_sext_i32_i8 s8, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s8 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX7-HSA-NEXT: s_sext_i32_i8 s11, s11 ; GFX7-HSA-NEXT: s_sext_i32_i8 s12, s12 ; GFX7-HSA-NEXT: s_sext_i32_i8 s13, s13 ; GFX7-HSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s7 -; GFX7-HSA-NEXT: s_sext_i32_i8 s9, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v35, s9 ; GFX7-HSA-NEXT: s_sext_i32_i8 s10, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s68 @@ -3856,45 +3854,47 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s56 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v18, s55 ; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s45 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v34, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s43 +; GFX7-HSA-NEXT: s_add_u32 s8, s16, 0x70 +; GFX7-HSA-NEXT: s_sext_i32_i8 s7, s7 ; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s53 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s50 ; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s49 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s47 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GFX7-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: s_sext_i32_i8 s6, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x60 ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s40 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: s_add_u32 s6, s16, 0x50 -; GFX7-HSA-NEXT: s_sext_i32_i8 s5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 @@ -3960,7 +3960,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v17, 8, s14 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v18, 8, s14 ; GFX8-NOHSA-NEXT: s_ashr_i32 s18, s0, 24 ; GFX8-NOHSA-NEXT: s_bfe_i32 s19, s0, 0x80010 ; GFX8-NOHSA-NEXT: s_ashr_i32 s20, s1, 24 @@ -3996,54 +3996,54 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_bfe_i32 s50, s15, 0x80010 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s15 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s15, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s14 ; GFX8-NOHSA-NEXT: 
s_add_u32 s14, s16, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s15 ; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 ; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xe0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v5, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s50 +; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v5, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s50 ; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s15 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v19, 8, s12 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v17, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s47 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s14 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v20, 8, s12 +; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v18, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s47 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s14, s12 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s12, s13 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v18, 8, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s12 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v19, 8, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s12 ; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xd0 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 ; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v18, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s45 +; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v19, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s45 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s12 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12 ; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xb0 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v19, 0, 8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s43 +; GFX8-NOHSA-NEXT: v_bfe_i32 v13, v20, 0, 8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s43 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 -; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v20, 8, s11 +; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s11 ; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s11 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s13 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s8 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4 -; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v20, 0, 8 +; GFX8-NOHSA-NEXT: v_bfe_i32 v12, v11, 0, 8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 
v11, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s42 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s41 @@ -5848,13 +5848,13 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 24 ; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[36:37], s[4:5], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[40:41], s[6:7], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -5863,14 +5863,14 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s12 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s37 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s24 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s25 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 @@ -5899,11 +5899,11 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s5 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -6820,71 +6820,72 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s7, 8 ; GFX6-NOHSA-NEXT: s_mov_b32 s50, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 16 -; GFX6-NOHSA-NEXT: 
s_lshr_b32 s20, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s48, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s3, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s44, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s3, 8 ; GFX6-NOHSA-NEXT: s_mov_b32 s40, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s1, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[48:49], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s54, s1 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s1, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[50:51], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[44:45], 0x80000 +; GFX6-NOHSA-NEXT: s_mov_b32 s62, s1 ; GFX6-NOHSA-NEXT: s_lshr_b32 s56, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s58, s0, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s60, s0, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[2:3], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[64:65], s[4:5], 56 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s64, s0, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s66, s0, 8 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[50:51], s[0:1], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 56 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s66 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s67 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s64 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s65 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s5 -; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:240 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[54:55], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[60:61], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 
s[40:41], s[58:59], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[50:51], s[56:57], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i64 s[54:55], s[2:3], 56 +; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 +; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s70 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s71 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s68 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s69 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s5 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[62:63], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[48:49], s[40:41], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[42:43], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 @@ -6895,78 +6896,81 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:224 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:208 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s17 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:192 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s54 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s55 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s14 +; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v9, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:192 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:176 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[8:11], 0 offset:160 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[8:11], 0 offset:128 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s50 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s51 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s27 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s45 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s29 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s30 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:96 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v4, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:64 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s45 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s46 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s47 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[8:11], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s50 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s51 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:16 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s39 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s41 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: @@ -6976,17 +6980,17 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s42, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s46, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s54, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s50, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s62, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8 ; GFX7-HSA-NEXT: s_mov_b32 s34, s3 @@ -6994,29 +6998,29 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 16 -; GFX7-HSA-NEXT: 
s_lshr_b32 s62, s1, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 8 ; GFX7-HSA-NEXT: s_mov_b32 s16, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s64, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s70, s0, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[20:21], s[2:3], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[38:39], s[4:5], 56 +; GFX7-HSA-NEXT: s_ashr_i64 s[42:43], s[4:5], 56 ; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56 -; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[6:7], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[6:7], 0x80000 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[70:71], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[64:65], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 @@ -7024,6 +7028,10 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 @@ -7031,73 +7039,69 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s40 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xe0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s41 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s41 -; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xc0 +; GFX7-HSA-NEXT: s_add_u32 s64, s8, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s65, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s48 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 
0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s49 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s49 +; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s46 +; GFX7-HSA-NEXT: s_add_u32 s46, s8, 0xc0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s47 +; GFX7-HSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s42 +; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s43 +; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s43 +; GFX7-HSA-NEXT: s_add_u32 s42, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s65 +; GFX7-HSA-NEXT: s_addc_u32 s43, s9, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s40 +; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 ; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s38 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s39 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xa0 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s39 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s46 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s56 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 ; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s41 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s51 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s42 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s40 ; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s57 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 
s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s41 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s25 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] @@ -9626,25 +9630,25 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: ALU 103, @16, KC0[], KC1[] ; EG-NEXT: ALU 104, @120, KC0[], KC1[] ; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T42.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T41.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 10: -; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 ; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: ; EG-NEXT: MOV * T0.Y, T16.X, ; EG-NEXT: MOV * T35.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 16: -; EG-NEXT: AND_INT T0.W, T36.X, literal.x, +; EG-NEXT: AND_INT T0.W, T37.X, literal.x, ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, ; EG-NEXT: 255(3.573311e-43), -65536(nan) ; EG-NEXT: OR_INT * T0.W, PS, PV.W, ; EG-NEXT: MOV * T16.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T0.W, T36.X, literal.x, +; EG-NEXT: LSHL * T0.W, T37.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, @@ -9654,27 +9658,27 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: MOV T0.Y, T17.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T1.W, T36.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W, ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, ; EG-NEXT: 16(2.242078e-44), -65536(nan) ; EG-NEXT: OR_INT * T1.W, PS, PV.W, ; EG-NEXT: MOV * T17.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T36.X, literal.x, +; EG-NEXT: LSHR * T1.W, T37.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.Y, PV.W, PS, +; EG-NEXT: OR_INT * T36.Y, PV.W, PS, ; EG-NEXT: MOV T17.X, PV.Y, ; EG-NEXT: MOV * T0.Y, T12.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T36.Y, literal.y, +; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV * T12.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T36.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T37.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, @@ -9682,28 +9686,28 @@ define 
amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV T12.X, PV.W, ; EG-NEXT: MOV T0.Y, T13.X, -; EG-NEXT: BFE_UINT * T1.W, T36.Y, literal.x, T0.W, +; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, ; EG-NEXT: MOV * T13.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T36.Y, literal.x, +; EG-NEXT: LSHR * T1.W, T37.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T37.W, PV.W, PS, +; EG-NEXT: OR_INT * T36.W, PV.W, PS, ; EG-NEXT: MOV T13.X, PV.W, ; EG-NEXT: MOV * T0.Y, T8.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T36.Z, literal.y, +; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV * T8.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T36.Z, literal.x, +; EG-NEXT: LSHL * T1.W, T37.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, @@ -9711,28 +9715,28 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV T8.X, PV.W, ; EG-NEXT: MOV T0.Y, T9.X, -; EG-NEXT: BFE_UINT * T1.W, T36.Z, literal.x, T0.W, +; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, ; EG-NEXT: -65536(nan), 0(0.000000e+00) ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, ; EG-NEXT: MOV * T9.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T36.Z, literal.x, +; EG-NEXT: LSHR * T1.W, T37.Z, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.Y, PV.W, PS, +; EG-NEXT: OR_INT * T37.Y, PV.W, PS, ; EG-NEXT: MOV T9.X, PV.Y, ; EG-NEXT: MOV * T0.Y, T4.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, -; EG-NEXT: AND_INT * T2.W, T36.W, literal.y, +; EG-NEXT: AND_INT * T2.W, T37.W, literal.y, ; EG-NEXT: -65536(nan), 255(3.573311e-43) ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV * T4.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHL * T1.W, T36.W, literal.x, +; EG-NEXT: LSHL * T1.W, T37.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, @@ -9740,7 +9744,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, PS, ; EG-NEXT: MOV T4.X, PV.W, ; EG-NEXT: MOV T0.Y, T5.X, -; EG-NEXT: BFE_UINT * T1.W, T36.W, literal.x, T0.W, +; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 120: ; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x, @@ -9748,12 +9752,12 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, ; EG-NEXT: MOV * T5.X, PV.W, ; EG-NEXT: MOV T0.Y, PV.X, -; EG-NEXT: LSHR * T1.W, T36.W, literal.x, +; EG-NEXT: LSHR * T1.W, T37.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) ; 
EG-NEXT: AND_INT T2.W, PV.Y, literal.x, ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) -; EG-NEXT: OR_INT * T36.W, PV.W, PS, +; EG-NEXT: OR_INT * T37.W, PV.W, PS, ; EG-NEXT: MOV T5.X, PV.W, ; EG-NEXT: MOV * T0.Y, T32.X, ; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, @@ -9883,10 +9887,10 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; EG-NEXT: MOV T21.X, PV.W, -; EG-NEXT: MOV * T37.X, T16.X, -; EG-NEXT: MOV * T37.Z, T12.X, -; EG-NEXT: MOV T36.X, T8.X, -; EG-NEXT: MOV T36.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T36.X, T16.X, +; EG-NEXT: MOV * T36.Z, T12.X, +; EG-NEXT: MOV T37.X, T8.X, +; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 ; EG-NEXT: MOV * T38.X, T32.X, ; EG-NEXT: MOV * T38.Z, T28.X, ; EG-NEXT: MOV T35.X, T24.X, diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 11c8fb2422ed0..e89c44d5b94a8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -622,32 +622,32 @@ entry: define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { ; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2: ; GCN-NOHSA-SI: ; %bb.0: ; %entry -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, s10 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, s11 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:2 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:4 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:6 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:8 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 offset:10 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[4:7], 0 offset:12 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v7, off, s[4:7], 0 offset:14 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[4:7], 0 offset:18 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[4:7], 0 offset:20 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[4:7], 0 offset:22 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[4:7], 0 offset:24 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v13, off, s[4:7], 0 offset:26 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v14, off, s[4:7], 0 offset:28 -; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[4:7], 0 offset:30 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s7 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[8:11], 0 offset:4 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:6 +; 
GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[8:11], 0 offset:8 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 offset:10 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[8:11], 0 offset:12 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v7, off, s[8:11], 0 offset:14 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[8:11], 0 offset:18 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[8:11], 0 offset:20 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[8:11], 0 offset:22 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[8:11], 0 offset:24 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v13, off, s[8:11], 0 offset:26 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v14, off, s[8:11], 0 offset:28 +; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[8:11], 0 offset:30 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8) ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 @@ -666,8 +666,8 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v13, v12 ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v5, v11, v10 ; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v9, v8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_load_v16i16_align2: @@ -2660,27 +2660,27 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 @@ -2689,64 +2689,64 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v27, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; GCN-HSA-NEXT: 
v_and_b32_e32 v6, 0xffff, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[16:19] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32: @@ -3068,22 +3068,22 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 @@ -3101,63 +3101,63 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: 
s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v18, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[15:18] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v18, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v16, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v17, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[11:14] +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v13, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v11, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_bfe_i32 v2, v11, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v6, v13, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v12, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v2, v15, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v14, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v0 +; GCN-HSA-NEXT: 
v_bfe_i32 v9, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v5, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[3:6] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32: @@ -3460,103 +3460,115 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, 
s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v13 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v40, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v44, 0xffff, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v48, 0xffff, v12 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v18 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v19 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v18 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v17 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v52, 0xffff, v16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v23 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v21 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v56, 0xffff, v20 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v24 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v62, 0xffff, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v24 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v30 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; 
GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v25, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v30 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v57, 0xffff, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v55, 0xffff, v27 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v29, 0xffff, v34 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v61, 0xffff, v32 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v59, 0xffff, v31 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v38 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v37 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v36 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, 
s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload @@ -3590,10 +3602,10 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[26:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 @@ -3604,71 +3616,71 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 
-; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v21 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v20 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v37, 16, v21 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v36, 0xffff, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[34:37] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xa0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v23 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v35, 16, v23 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v34, 0xffff, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v32, 0xffff, v22 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[32:35] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v16 ; GCN-HSA-NEXT: v_and_b32_e32 v22, 0xffff, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v20, 0xffff, v16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v19 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v18 ; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[19:22] +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[19:22] ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 
v[25:26], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 ; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 @@ -3686,7 +3698,7 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[16:19] ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v4 @@ -3695,52 +3707,52 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[11:14] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[11:14] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v33 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v24, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v25, 0xffff, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v23, 0xffff, v32 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v30 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v29 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v3 +; GCN-HSA-NEXT: s_waitcnt vmcnt(10) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v27 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v28 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v30, 16, v25 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v24 +; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v27 +; GCN-HSA-NEXT: v_and_b32_e32 v29, 0xffff, v25 +; GCN-HSA-NEXT: v_and_b32_e32 v27, 0xffff, v24 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[27:30] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v35 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v21, 0xffff, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v34 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xffff, v26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; 
GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v28 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v31 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] @@ -3772,102 +3784,112 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v32, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v33, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v34, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v35, off, s[88:91], 0 offset:16 ; 4-byte Folded 
Spill -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v28 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, 0xffff, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v18 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v19 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v17 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xffff, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v23 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v23 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v22 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, 0xffff, v21 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v20 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v27 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v24 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v26 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, 0xffff, v25 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, 0xffff, v24 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v12 +; 
GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dword v19, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v20, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v21, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v11 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v10 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v8 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v48, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v49, 0xffff, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 16, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v53, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v57, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v27 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v62, 16, v32 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v60, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v61, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v59, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v38 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v32, 16, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v31, 0xffff, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v42 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v35, 0xffff, v41 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 
v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload @@ -3892,35 +3914,35 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T58.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T56.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T55.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T53.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T53.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T48.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T46.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T44.X, 0 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 1 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T41.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 22: -; EG-NEXT: VTX_READ_128 
T36.XYZW, T35.X, 0, #1 -; EG-NEXT: VTX_READ_128 T38.XYZW, T35.X, 48, #1 -; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 32, #1 -; EG-NEXT: VTX_READ_128 T40.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T36.XYZW, T37.X, 0, #1 +; EG-NEXT: VTX_READ_128 T38.XYZW, T37.X, 48, #1 +; EG-NEXT: VTX_READ_128 T39.XYZW, T37.X, 32, #1 +; EG-NEXT: VTX_READ_128 T40.XYZW, T37.X, 16, #1 ; EG-NEXT: Fetch clause starting at 30: -; EG-NEXT: VTX_READ_128 T49.XYZW, T35.X, 112, #1 -; EG-NEXT: VTX_READ_128 T50.XYZW, T35.X, 96, #1 -; EG-NEXT: VTX_READ_128 T51.XYZW, T35.X, 80, #1 -; EG-NEXT: VTX_READ_128 T52.XYZW, T35.X, 64, #1 +; EG-NEXT: VTX_READ_128 T49.XYZW, T37.X, 112, #1 +; EG-NEXT: VTX_READ_128 T50.XYZW, T37.X, 96, #1 +; EG-NEXT: VTX_READ_128 T51.XYZW, T37.X, 80, #1 +; EG-NEXT: VTX_READ_128 T52.XYZW, T37.X, 64, #1 ; EG-NEXT: ALU clause starting at 38: -; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: MOV * T37.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 39: -; EG-NEXT: LSHR * T37.W, T36.W, literal.x, +; EG-NEXT: LSHR * T35.W, T36.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT * T37.Z, T36.W, literal.x, +; EG-NEXT: AND_INT * T35.Z, T36.W, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHR T37.Y, T36.Z, literal.x, +; EG-NEXT: LSHR T35.Y, T36.Z, literal.x, ; EG-NEXT: LSHR * T36.W, T36.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T37.X, T36.Z, literal.x, +; EG-NEXT: AND_INT T35.X, T36.Z, literal.x, ; EG-NEXT: AND_INT T36.Z, T36.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) @@ -3965,16 +3987,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: LSHR * T35.W, T38.W, literal.y, +; EG-NEXT: LSHR * T37.W, T38.W, literal.y, ; EG-NEXT: 64(8.968310e-44), 16(2.242078e-44) ; EG-NEXT: LSHR T48.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T35.Z, T38.W, literal.y, +; EG-NEXT: AND_INT * T37.Z, T38.W, literal.y, ; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) ; EG-NEXT: ALU clause starting at 96: -; EG-NEXT: LSHR T35.Y, T38.Z, literal.x, +; EG-NEXT: LSHR T37.Y, T38.Z, literal.x, ; EG-NEXT: LSHR * T38.W, T38.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T35.X, T38.Z, literal.x, +; EG-NEXT: AND_INT T37.X, T38.Z, literal.x, ; EG-NEXT: AND_INT T38.Z, T38.Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43) @@ -4253,14 +4275,14 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 -; GCN-NOHSA-SI-NEXT: 
buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[40:43], off, s[4:7], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v10 @@ -4275,76 +4297,76 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v8 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v9, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v31 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v30 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v31, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v30, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v29 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v28 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v29, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v28, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v35 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v34 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v35, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v34, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v33 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v32 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v33, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v32, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v35 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v34 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v35, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v34, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v33 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v32 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v33, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v32, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v39 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v38 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v39, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v38, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v37 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v36 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v37, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v36, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v27 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v26 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v27, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v26, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v25 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v24 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v25, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v24, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v37 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v36 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v37, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v36, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v43 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v42 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v43, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v42, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v41 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v40 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v41, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v40, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v31 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v30 +; 
GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v31, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v30, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v29 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v28 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v29, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v28, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v30, 16, v27 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 16, v26 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v26, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v25 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v24 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v25, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v24, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v23 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v22 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v23, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v21 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v20 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v20, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 16, v19 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 16, v18 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v19, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v15, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v14, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v12 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v12, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v17 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v16, 0, 16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload @@ -4386,14 +4408,14 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[24:25] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] +; GCN-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] @@ -4404,27 +4426,27 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v21 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v20 -; GCN-HSA-NEXT: v_bfe_i32 v30, v21, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v20, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v21 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v20 +; GCN-HSA-NEXT: v_bfe_i32 v34, v21, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v32, v20, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] +; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 16, v23 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 16, v22 -; GCN-HSA-NEXT: v_bfe_i32 v30, v23, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v28, v22, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[28:31] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 16, v23 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 16, v22 +; GCN-HSA-NEXT: v_bfe_i32 v34, v23, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v32, v22, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[32:35] ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) @@ -4442,9 +4464,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v21, v19, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v19, v18, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 
s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[19:22] -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[19:22] +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) @@ -4458,7 +4480,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v22, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v14, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v38, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[16:19] ; GCN-HSA-NEXT: flat_store_dwordx4 v[36:37], v[20:23] ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 @@ -4470,7 +4492,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[15:18] +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[11:14] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 @@ -4500,38 +4522,38 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v26 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v28 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_bfe_i32 v15, v26, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v25 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 16, v24 -; GCN-HSA-NEXT: v_bfe_i32 v21, v25, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v24, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v33 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 16, v32 -; GCN-HSA-NEXT: v_bfe_i32 v25, v33, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v23, v32, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v19, v28, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v27 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v26 +; GCN-HSA-NEXT: v_bfe_i32 v2, v27, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v26, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v28, 16, v25 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 16, v24 +; GCN-HSA-NEXT: v_bfe_i32 v27, v25, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v25, v24, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[25:28] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34 -; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v29 +; GCN-HSA-NEXT: v_bfe_i32 v21, v29, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v27 -; GCN-HSA-NEXT: v_bfe_i32 v17, v27, 0, 16 +; GCN-HSA-NEXT: 
v_ashrrev_i32_e32 v18, 16, v31 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v17, v31, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v30, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] @@ -4561,102 +4583,112 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v3 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v2 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dword v32, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 16, v15 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v15, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: buffer_store_dword v33, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v34, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: buffer_store_dword v35, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v1 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v0 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v29 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v28 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v29, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v28, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v7 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v7, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v5 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v4 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v5, 0, 16 -; 
GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v11 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v47, 16, v9 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v8 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v8, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v14 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v15, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v13 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v12 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v13, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v12, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v19 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v18 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v19, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v18, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v17 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v17, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v16, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v23 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v22 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v23, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v22, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v21 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v20 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v21, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v20, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v27 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v26 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v27, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v26, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v63, 16, v25 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 16, v24 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v62, v25, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v24, 0, 16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v31 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v30 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v31, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v30, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v17, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 16, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v13, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dword v14, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v15, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: buffer_store_dword v16, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v11 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v42, 16, v9 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v40, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v41, v9, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, 
v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v50, 16, v1 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v48, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v49, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v0, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v36 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v35 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v35, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v46, 16, v5 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v44, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v45, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v43, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v54, 16, v26 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v52, 16, v25 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v53, v26, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v25, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v58, 16, v24 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v56, 16, v23 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v57, v24, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v23, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 16, v30 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 16, v29 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v30, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v29, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v62, 16, v28 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v60, 16, v27 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v61, v28, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v27, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v30, 16, v34 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 16, v33 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v34, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v33, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v32 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v31 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v32, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v31, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v34, 16, v38 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v32, 16, v37 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v38, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v31, v37, 0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 
v[55:58], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload @@ -4878,7 +4910,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: ALU 82, @57, KC0[CB0:0-32], KC1[] ; CM-NEXT: ALU 72, @140, KC0[CB0:0-32], KC1[] ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T65, T66.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T36.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T35.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T64, T56.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T55.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T63, T54.X @@ -4896,8 +4928,8 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 24: -; CM-NEXT: VTX_READ_128 T35.XYZW, T37.X, 16, #1 -; CM-NEXT: VTX_READ_128 T36.XYZW, T37.X, 0, #1 +; CM-NEXT: VTX_READ_128 T36.XYZW, T37.X, 16, #1 +; CM-NEXT: VTX_READ_128 T35.XYZW, T37.X, 0, #1 ; CM-NEXT: Fetch clause starting at 28: ; CM-NEXT: VTX_READ_128 T41.XYZW, T37.X, 112, #1 ; CM-NEXT: VTX_READ_128 T42.XYZW, T37.X, 96, #1 @@ -4914,22 +4946,22 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43) ; CM-NEXT: LSHR T39.X, PV.W, literal.x, -; CM-NEXT: LSHR T0.Y, T36.Z, literal.y, -; CM-NEXT: LSHR T0.Z, T36.W, literal.y, +; CM-NEXT: LSHR T0.Y, T35.Z, literal.y, +; CM-NEXT: LSHR T0.Z, T35.W, literal.y, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: 192(2.690493e-43), 0(0.000000e+00) ; CM-NEXT: LSHR T40.X, PV.W, literal.x, -; CM-NEXT: LSHR T1.Y, T36.Y, literal.y, -; CM-NEXT: LSHR T1.Z, T35.Z, literal.y, -; CM-NEXT: LSHR * T0.W, T35.W, literal.y, +; CM-NEXT: LSHR T1.Y, T35.Y, literal.y, +; CM-NEXT: LSHR T1.Z, T36.Z, literal.y, +; CM-NEXT: LSHR * T0.W, 
T36.W, literal.y, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; CM-NEXT: ALU clause starting at 57: -; CM-NEXT: LSHR T2.Z, T35.X, literal.x, +; CM-NEXT: LSHR T2.Z, T36.X, literal.x, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, ; CM-NEXT: 16(2.242078e-44), 208(2.914701e-43) ; CM-NEXT: LSHR T46.X, PV.W, literal.x, -; CM-NEXT: LSHR T2.Y, T35.Y, literal.y, +; CM-NEXT: LSHR T2.Y, T36.Y, literal.y, ; CM-NEXT: LSHR T3.Z, T37.Z, literal.y, ; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -5051,31 +5083,31 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; CM-NEXT: BFE_INT T63.X, T37.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T62.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T37.Z, T35.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT T37.Z, T36.Y, 0.0, literal.x, ; CM-NEXT: BFE_INT * T45.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T37.X, T35.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T37.X, T36.X, 0.0, literal.x, ; CM-NEXT: BFE_INT T45.Y, T4.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T64.Z, T35.W, 0.0, literal.x, +; CM-NEXT: BFE_INT T64.Z, T36.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T63.W, T3.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T64.X, T35.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T64.X, T36.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T63.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: BFE_INT T35.Z, T36.Y, 0.0, literal.x, +; CM-NEXT: BFE_INT T36.Z, T35.Y, 0.0, literal.x, ; CM-NEXT: BFE_INT * T37.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T35.X, T36.X, 0.0, literal.x, +; CM-NEXT: BFE_INT T36.X, T35.X, 0.0, literal.x, ; CM-NEXT: BFE_INT T37.Y, T2.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T65.Z, T36.W, 0.0, literal.x, +; CM-NEXT: BFE_INT T65.Z, T35.W, 0.0, literal.x, ; CM-NEXT: BFE_INT * T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T65.X, T36.Z, 0.0, literal.x, +; CM-NEXT: BFE_INT T65.X, T35.Z, 0.0, literal.x, ; CM-NEXT: BFE_INT T64.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: LSHR T1.Z, T36.X, literal.x, -; CM-NEXT: BFE_INT * T35.W, T1.Y, 0.0, literal.x, +; CM-NEXT: LSHR T1.Z, T35.X, literal.x, +; CM-NEXT: BFE_INT * T36.W, T1.Y, 0.0, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T36.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT T35.Y, PV.Z, 0.0, literal.y, +; CM-NEXT: LSHR T35.X, KC0[2].Y, literal.x, +; CM-NEXT: BFE_INT T36.Y, PV.Z, 0.0, literal.y, ; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y, ; CM-NEXT: BFE_INT * T65.W, T0.Z, 0.0, literal.y, ; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) @@ -5987,14 +6019,14 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0 @@ -6004,8 +6036,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 @@ -6015,10 +6047,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: @@ -6872,12 +6904,12 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 @@ -6909,48 +6941,48 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; 
GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v16, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: s_waitcnt vmcnt(2) -; GCN-HSA-NEXT: v_mov_b32_e32 v16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v12, v5, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v16, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] +; GCN-HSA-NEXT: s_waitcnt vmcnt(3) +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: v_ashr_i64 v[14:15], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[6:7], 48 -; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 +; GCN-HSA-NEXT: v_bfe_i32 v6, v16, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-HSA-NEXT: v_bfe_i32 v4, v16, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; @@ -6964,62 +6996,62 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v6 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v10, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v27, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v15, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v6, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v19, v8, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v23, v14, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v7, 0, 16 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v5, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v13, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 -; GCN-NOHSA-VI-NEXT: 
v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 +; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i16_to_v16i64: @@ -7213,139 +7245,113 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[2:5], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[6:9], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[19:22], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[25:28], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v22 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v2 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v4 -; GCN-NOHSA-SI-NEXT: s_waitcnt 
expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v22 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v12 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v1 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload -; 
GCN-NOHSA-SI-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v62, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v21 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v20 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v27 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v26 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xffff, v28 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v29 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v31 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v31 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v30 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 16, v32 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v32 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v36 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v33 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v57, 0xffff, v33 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v35 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v50, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v52, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v42, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v44, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v38, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v40, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v58, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v46, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, v24 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NOHSA-SI-NEXT: 
s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v63, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v60, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v61, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v62, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v63, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(4) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v60, 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -7516,95 +7522,95 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v56, 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[15:18], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v47, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v58, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v38, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v41, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v55 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 
16, v14 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v37, 0xffff, v29 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v41, 0xffff, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v46, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v32 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v56 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v7 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v56 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v23, 0xffff, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v33 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v55, 0xffff, v36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v56 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v56 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v44, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v56 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: 
v_lshrrev_b32_e32 v38, 16, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v43, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v55 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v18 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, v55 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v52, 16, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v34 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v55 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v19, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v47, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v55 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v50, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v55 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; 
EG-LABEL: global_zextload_v32i16_to_v32i64: @@ -7777,117 +7783,117 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; CM-NEXT: ALU 33, @31, KC0[], KC1[] ; CM-NEXT: TEX 0 @28 ; CM-NEXT: ALU 94, @65, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T50.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T50.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T49.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T48.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T47.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T46.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T46.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T45.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T44.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, T43.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T42.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T42.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T41.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T40.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T39.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T38.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T38.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T37.X ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T34, T36.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T22.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T23.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 22: -; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1 -; CM-NEXT: VTX_READ_128 T21.XYZW, T19.X, 32, #1 -; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 16, #1 +; CM-NEXT: VTX_READ_128 T21.XYZW, T20.X, 0, #1 +; CM-NEXT: VTX_READ_128 T22.XYZW, T20.X, 32, #1 +; CM-NEXT: VTX_READ_128 T23.XYZW, T20.X, 16, #1 ; CM-NEXT: Fetch clause starting at 28: -; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 48, #1 +; CM-NEXT: VTX_READ_128 T23.XYZW, T20.X, 48, #1 ; CM-NEXT: ALU clause starting at 30: -; CM-NEXT: MOV * T19.X, KC0[2].Z, +; CM-NEXT: MOV * T20.X, KC0[2].Z, ; CM-NEXT: ALU clause starting at 31: -; CM-NEXT: LSHR * T23.Z, T20.Y, literal.x, +; CM-NEXT: LSHR * T19.Z, T21.Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT T23.X, T20.Y, literal.x, -; CM-NEXT: MOV T23.Y, 0.0, -; CM-NEXT: LSHR * T24.Z, T20.X, literal.y, +; CM-NEXT: AND_INT T19.X, T21.Y, literal.x, +; CM-NEXT: MOV T19.Y, 0.0, +; CM-NEXT: LSHR * T24.Z, T21.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T24.X, T20.X, literal.x, +; CM-NEXT: AND_INT T24.X, T21.X, literal.x, ; CM-NEXT: MOV T24.Y, 0.0, -; CM-NEXT: LSHR * T25.Z, T20.W, literal.y, +; CM-NEXT: LSHR * T25.Z, T21.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T25.X, T20.W, literal.x, +; CM-NEXT: AND_INT T25.X, T21.W, literal.x, ; CM-NEXT: MOV T25.Y, 0.0, -; CM-NEXT: LSHR * T26.Z, T20.Z, literal.y, +; CM-NEXT: LSHR * T26.Z, T21.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T26.X, T20.Z, literal.x, +; CM-NEXT: AND_INT T26.X, T21.Z, literal.x, ; CM-NEXT: MOV T26.Y, 0.0, -; CM-NEXT: LSHR * T20.Z, T22.Y, literal.y, +; CM-NEXT: LSHR * T21.Z, T23.Y, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T20.X, T22.Y, literal.x, -; CM-NEXT: MOV T20.Y, 0.0, -; CM-NEXT: LSHR * T27.Z, T22.X, literal.y, +; CM-NEXT: AND_INT T21.X, T23.Y, literal.x, +; CM-NEXT: MOV T21.Y, 0.0, +; CM-NEXT: LSHR * T27.Z, T23.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T27.X, T22.X, literal.x, +; CM-NEXT: AND_INT T27.X, T23.X, literal.x, ; CM-NEXT: 
MOV T27.Y, 0.0, -; CM-NEXT: LSHR * T28.Z, T22.W, literal.y, +; CM-NEXT: LSHR * T28.Z, T23.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T28.X, T22.W, literal.x, +; CM-NEXT: AND_INT T28.X, T23.W, literal.x, ; CM-NEXT: MOV T28.Y, 0.0, -; CM-NEXT: LSHR * T29.Z, T22.Z, literal.y, +; CM-NEXT: LSHR * T29.Z, T23.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T29.X, T22.Z, literal.x, +; CM-NEXT: AND_INT T29.X, T23.Z, literal.x, ; CM-NEXT: MOV T29.Y, 0.0, -; CM-NEXT: LSHR * T19.Z, T21.Y, literal.y, +; CM-NEXT: LSHR * T20.Z, T22.Y, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) ; CM-NEXT: ALU clause starting at 65: -; CM-NEXT: AND_INT T19.X, T21.Y, literal.x, -; CM-NEXT: MOV T19.Y, 0.0, -; CM-NEXT: LSHR * T30.Z, T21.X, literal.y, +; CM-NEXT: AND_INT T20.X, T22.Y, literal.x, +; CM-NEXT: MOV T20.Y, 0.0, +; CM-NEXT: LSHR * T30.Z, T22.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T30.X, T21.X, literal.x, +; CM-NEXT: AND_INT T30.X, T22.X, literal.x, ; CM-NEXT: MOV T30.Y, 0.0, -; CM-NEXT: LSHR * T31.Z, T21.W, literal.y, +; CM-NEXT: LSHR * T31.Z, T22.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T31.X, T21.W, literal.x, +; CM-NEXT: AND_INT T31.X, T22.W, literal.x, ; CM-NEXT: MOV T31.Y, 0.0, -; CM-NEXT: LSHR * T32.Z, T21.Z, literal.y, +; CM-NEXT: LSHR * T32.Z, T22.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T32.X, T21.Z, literal.x, +; CM-NEXT: AND_INT T32.X, T22.Z, literal.x, ; CM-NEXT: MOV T32.Y, 0.0, -; CM-NEXT: LSHR * T21.Z, T22.Y, literal.y, +; CM-NEXT: LSHR * T22.Z, T23.Y, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T21.X, T22.Y, literal.x, -; CM-NEXT: MOV T21.Y, 0.0, -; CM-NEXT: LSHR * T33.Z, T22.X, literal.y, +; CM-NEXT: AND_INT T22.X, T23.Y, literal.x, +; CM-NEXT: MOV T22.Y, 0.0, +; CM-NEXT: LSHR * T33.Z, T23.X, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T33.X, T22.X, literal.x, +; CM-NEXT: AND_INT T33.X, T23.X, literal.x, ; CM-NEXT: MOV T33.Y, 0.0, -; CM-NEXT: LSHR * T34.Z, T22.W, literal.y, +; CM-NEXT: LSHR * T34.Z, T23.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T34.X, T22.W, literal.x, +; CM-NEXT: AND_INT T34.X, T23.W, literal.x, ; CM-NEXT: MOV T34.Y, 0.0, -; CM-NEXT: LSHR * T35.Z, T22.Z, literal.y, +; CM-NEXT: LSHR * T35.Z, T23.Z, literal.y, ; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) -; CM-NEXT: AND_INT T35.X, T22.Z, literal.x, +; CM-NEXT: AND_INT T35.X, T23.Z, literal.x, ; CM-NEXT: MOV T35.Y, 0.0, -; CM-NEXT: MOV * T23.W, 0.0, +; CM-NEXT: MOV * T19.W, 0.0, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; CM-NEXT: MOV * T24.W, 0.0, ; CM-NEXT: MOV * T25.W, 0.0, ; CM-NEXT: MOV * T26.W, 0.0, -; CM-NEXT: MOV * T20.W, 0.0, +; CM-NEXT: MOV * T21.W, 0.0, ; CM-NEXT: MOV * T27.W, 0.0, ; CM-NEXT: MOV * T28.W, 0.0, ; CM-NEXT: MOV * T29.W, 0.0, -; CM-NEXT: MOV * T19.W, 0.0, +; CM-NEXT: MOV * T20.W, 0.0, ; CM-NEXT: MOV * T30.W, 0.0, ; CM-NEXT: MOV * T31.W, 0.0, ; CM-NEXT: MOV * T32.W, 0.0, -; CM-NEXT: MOV * T21.W, 0.0, +; CM-NEXT: MOV * T22.W, 0.0, ; CM-NEXT: MOV * T33.W, 0.0, ; CM-NEXT: MOV * T34.W, 0.0, ; CM-NEXT: MOV * T35.W, 0.0, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00) -; CM-NEXT: LSHR T22.X, PV.W, literal.x, +; CM-NEXT: LSHR T23.X, PV.W, literal.x, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; CM-NEXT: 2(2.802597e-45), 
240(3.363116e-43) ; CM-NEXT: LSHR T36.X, PV.W, literal.x, @@ -7957,45 +7963,45 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v15 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[2:3], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[2:3], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[0:1], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[0:1], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[6:7], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v22, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[6:7], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[4:5], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[4:5], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v10 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[10:11], 48 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[10:11], 48 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[8:9], 48 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[20:21], v[8:9], 48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v17, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v23, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[19:20], v[14:15], 48 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:48 @@ -8020,7 +8026,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v10, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v22, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v24, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 @@ -8058,7 +8064,7 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -8066,9 +8072,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 @@ -8091,8 +8097,8 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[4:5], 48 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[8:9], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 @@ -8100,73 +8106,73 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v11 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[6:7], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[10:11], 48 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v10 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] ; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x50 -; GCN-HSA-NEXT: 
v_bfe_i32 v18, v5, 0, 16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v18, v9, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GCN-HSA-NEXT: v_bfe_i32 v16, v10, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-HSA-NEXT: s_add_u32 s10, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[2:3], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[8:9], 48 -; GCN-HSA-NEXT: v_bfe_i32 v3, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v11 -; GCN-HSA-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[10:11], 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[12:13], 48 -; GCN-HSA-NEXT: v_bfe_i32 v3, v13, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[3:6] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 +; GCN-HSA-NEXT: v_bfe_i32 v7, v13, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[12:13], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v15 -; GCN-HSA-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[14:15], 48 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v23, s7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 +; GCN-HSA-NEXT: v_bfe_i32 v7, v3, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[9:10], v[14:15], 48 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] ; GCN-HSA-NEXT: v_bfe_i32 v19, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v23, v2, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v21, v1, 0, 16 @@ -8180,50 +8186,50 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[23:26] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_bfe_i32 v15, v10, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v15, v6, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v17, v18, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_bfe_i32 v13, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v7, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v3, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_bfe_i32 v11, v14, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v3, v12, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v12, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: 
global_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 969cf2c457b98..b4e9376d82777 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -1556,24 +1556,24 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; EG-NEXT: Fetch clause starting at 12: ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV T1.W, literal.x, -; EG-NEXT: SETNE_INT * T0.W, KC0[2].W, 0.0, +; EG-NEXT: MOV T0.W, literal.x, +; EG-NEXT: SETNE_INT * T1.W, KC0[2].W, 0.0, ; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) ; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, ; EG-NEXT: ALU clause starting at 18: -; EG-NEXT: MOV T0.W, KC0[2].W, +; EG-NEXT: MOV T1.W, KC0[2].W, ; EG-NEXT: MOV * T2.W, KC0[3].X, -; EG-NEXT: MOV T1.W, literal.x, +; EG-NEXT: MOV T0.W, literal.x, ; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) ; EG-NEXT: ALU clause starting at 23: -; EG-NEXT: MOV T0.W, KC0[2].Y, -; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, +; EG-NEXT: MOV T1.W, KC0[2].Y, +; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0, ; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, ; EG-NEXT: ALU clause starting at 26: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 27: -; EG-NEXT: LSHR * T1.X, T0.W, literal.x, +; EG-NEXT: LSHR * T1.X, T1.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp eq i32 %a, 0 @@ -1923,52 +1923,52 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; ; GFX9-LABEL: s_mul_i128: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s15, 0xf000 -; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s8, s7 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s6 -; GFX9-NEXT: s_mul_i32 s2, s10, s5 -; GFX9-NEXT: s_mul_hi_u32 s3, s10, s4 +; GFX9-NEXT: s_mul_i32 s0, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 +; GFX9-NEXT: s_mul_i32 s2, s14, s9 +; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s9, s6 +; GFX9-NEXT: s_mul_i32 s1, s13, s10 ; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: s_mul_i32 s3, s11, s4 +; GFX9-NEXT: s_mul_i32 s3, s15, s8 ; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s8, s6 +; GFX9-NEXT: s_mul_i32 s1, s12, s10 ; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_mul_i32 s3, s10, s4 +; GFX9-NEXT: s_mul_i32 s3, s14, s8 ; GFX9-NEXT: s_add_u32 s3, s3, s1 ; GFX9-NEXT: s_addc_u32 s2, s2, s0 -; GFX9-NEXT: s_mul_i32 s10, s5, s8 -; GFX9-NEXT: s_mul_hi_u32 s11, s4, s8 -; GFX9-NEXT: s_mul_hi_u32 s7, s5, s8 -; GFX9-NEXT: s_add_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_i32 s1, s4, s9 -; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: s_mul_hi_u32 s6, s4, s9 -; GFX9-NEXT: s_add_u32 s1, s1, s10 -; GFX9-NEXT: s_addc_u32 s6, s6, 0 -; GFX9-NEXT: s_add_u32 s6, s7, s6 -; GFX9-NEXT: s_addc_u32 s7, 0, 0 -; GFX9-NEXT: s_mul_hi_u32 s10, s5, s9 -; GFX9-NEXT: s_mul_i32 s5, s5, s9 -; GFX9-NEXT: s_add_u32 s5, s5, s6 -; GFX9-NEXT: s_addc_u32 s6, s10, s7 +; GFX9-NEXT: s_mul_i32 s14, s9, s12 +; GFX9-NEXT: 
s_mul_hi_u32 s15, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 +; GFX9-NEXT: s_add_u32 s14, s14, s15 +; GFX9-NEXT: s_mul_i32 s1, s8, s13 +; GFX9-NEXT: s_addc_u32 s11, s11, 0 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 +; GFX9-NEXT: s_add_u32 s1, s1, s14 +; GFX9-NEXT: s_addc_u32 s10, s10, 0 +; GFX9-NEXT: s_add_u32 s10, s11, s10 +; GFX9-NEXT: s_addc_u32 s11, 0, 0 +; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 +; GFX9-NEXT: s_mul_i32 s9, s9, s13 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_addc_u32 s10, s14, s11 ; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_add_u32 s5, s5, s3 -; GFX9-NEXT: s_addc_u32 s6, s6, s2 -; GFX9-NEXT: s_mul_i32 s2, s4, s8 +; GFX9-NEXT: s_add_u32 s9, s9, s3 +; GFX9-NEXT: s_addc_u32 s10, s10, s2 +; GFX9-NEXT: s_mul_i32 s2, s8, s12 ; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_i128: diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index ce1a9ad58f011..47d06fa30a01e 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -4,8 +4,8 @@ # is killed by that store. # GCN-LABEL: name: global_sextload_v32i32_to_v32i64 -# GCN: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) -# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr20, killed renamable $vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr0 +# GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) +# GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 --- name: global_sextload_v32i32_to_v32i64 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 9e6efc79d44f0..a462c19ce645d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -821,12 +821,12 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: .LBB1_2: ; %for.body ; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[6:7], off offset:-2048 +; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], 
off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6 @@ -847,8 +847,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_addk_i32 s4, 0x2000 ; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff ; GFX90A-NEXT: s_waitcnt vmcnt(8) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v18, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v19, v4, vcc @@ -869,10 +869,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v20, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v21, v4, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v10, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v4, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index 8526efa18767d..415ed89668abb 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -91,20 +91,20 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; ; SI-IEEE-SAFE-LABEL: rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -129,24 +129,24 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; 
CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -169,7 +169,7 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: rsq_f32: ; GCN-UNSAFE: ; %bb.0: @@ -605,20 +605,20 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; ; SI-IEEE-SAFE-LABEL: neg_rsq_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -643,24 +643,24 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: neg_rsq_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; CI-IEEE-SAFE-NEXT: 
s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 ; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0 @@ -683,7 +683,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_f32: ; GCN-UNSAFE: ; %bb.0: @@ -786,20 +786,20 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; ; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; SI-IEEE-SAFE: ; %bb.0: -; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 ; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 ; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; SI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 ; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 @@ -824,24 +824,24 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-IEEE-SAFE-NEXT: s_endpgm ; ; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32: ; CI-IEEE-SAFE: ; %bb.0: -; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s11, 0xf000 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s10, -1 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s10 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s11 +; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7 ; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s6 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s7 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11 ; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, 
s[0:3], 0 ; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000 ; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s8, s4 -; CI-IEEE-SAFE-NEXT: s_mov_b32 s9, s5 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8 +; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9 ; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0 ; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0 @@ -864,7 +864,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 -; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-IEEE-SAFE-NEXT: s_endpgm ; GCN-UNSAFE-LABEL: neg_rsq_neg_f32: ; GCN-UNSAFE: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 09cd01af5bccb..d19a9233e118b 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -63,101 +63,101 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v23, s18, 28 ; CHECK-NEXT: v_writelane_b32 v23, s19, 29 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[2:3] +; CHECK-NEXT: ; def s[42:43] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s2, 30 -; CHECK-NEXT: v_writelane_b32 v23, s3, 31 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ; def s[52:55] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 32 -; CHECK-NEXT: v_writelane_b32 v23, s5, 33 -; CHECK-NEXT: v_writelane_b32 v23, s6, 34 -; CHECK-NEXT: v_writelane_b32 v23, s7, 35 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 36 -; CHECK-NEXT: v_writelane_b32 v23, s5, 37 -; CHECK-NEXT: v_writelane_b32 v23, s6, 38 -; CHECK-NEXT: v_writelane_b32 v23, s7, 39 -; CHECK-NEXT: v_writelane_b32 v23, s8, 40 -; CHECK-NEXT: v_writelane_b32 v23, s9, 41 -; CHECK-NEXT: v_writelane_b32 v23, s10, 42 -; CHECK-NEXT: v_writelane_b32 v23, s11, 43 +; CHECK-NEXT: v_writelane_b32 v23, s4, 30 +; CHECK-NEXT: v_writelane_b32 v23, s5, 31 +; CHECK-NEXT: v_writelane_b32 v23, s6, 32 +; CHECK-NEXT: v_writelane_b32 v23, s7, 33 +; CHECK-NEXT: v_writelane_b32 v23, s8, 34 +; CHECK-NEXT: v_writelane_b32 v23, s9, 35 +; CHECK-NEXT: v_writelane_b32 v23, s10, 36 +; CHECK-NEXT: v_writelane_b32 v23, s11, 37 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:53] +; CHECK-NEXT: ; def s[40:41] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[48:51] +; CHECK-NEXT: ; def s[36:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[36:43] +; CHECK-NEXT: ; def s[44:51] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 44 -; CHECK-NEXT: v_writelane_b32 v23, s1, 45 -; CHECK-NEXT: v_writelane_b32 v23, s2, 46 -; CHECK-NEXT: v_writelane_b32 v23, s3, 47 -; CHECK-NEXT: v_writelane_b32 v23, s4, 48 -; CHECK-NEXT: v_writelane_b32 v23, s5, 49 -; CHECK-NEXT: v_writelane_b32 v23, s6, 50 -; CHECK-NEXT: v_writelane_b32 v23, s7, 51 -; CHECK-NEXT: v_writelane_b32 v23, s8, 52 -; 
CHECK-NEXT: v_writelane_b32 v23, s9, 53 -; CHECK-NEXT: v_writelane_b32 v23, s10, 54 -; CHECK-NEXT: v_writelane_b32 v23, s11, 55 -; CHECK-NEXT: v_writelane_b32 v23, s12, 56 -; CHECK-NEXT: v_writelane_b32 v23, s13, 57 -; CHECK-NEXT: v_writelane_b32 v23, s14, 58 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v23, s15, 59 +; CHECK-NEXT: v_writelane_b32 v23, s0, 38 +; CHECK-NEXT: v_writelane_b32 v23, s1, 39 +; CHECK-NEXT: v_writelane_b32 v23, s2, 40 +; CHECK-NEXT: v_writelane_b32 v23, s3, 41 +; CHECK-NEXT: v_writelane_b32 v23, s4, 42 +; CHECK-NEXT: v_writelane_b32 v23, s5, 43 +; CHECK-NEXT: v_writelane_b32 v23, s6, 44 +; CHECK-NEXT: v_writelane_b32 v23, s7, 45 +; CHECK-NEXT: v_writelane_b32 v23, s8, 46 +; CHECK-NEXT: v_writelane_b32 v23, s9, 47 +; CHECK-NEXT: v_writelane_b32 v23, s10, 48 +; CHECK-NEXT: v_writelane_b32 v23, s11, 49 +; CHECK-NEXT: v_writelane_b32 v23, s12, 50 +; CHECK-NEXT: v_writelane_b32 v23, s13, 51 +; CHECK-NEXT: v_writelane_b32 v23, s14, 52 +; CHECK-NEXT: v_writelane_b32 v23, s15, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:47] +; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 54 +; CHECK-NEXT: v_writelane_b32 v23, s1, 55 +; CHECK-NEXT: v_writelane_b32 v23, s2, 56 +; CHECK-NEXT: v_writelane_b32 v23, s3, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 60 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v23, s1, 61 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: v_writelane_b32 v23, s2, 62 -; CHECK-NEXT: v_writelane_b32 v0, s6, 2 -; CHECK-NEXT: v_writelane_b32 v23, s3, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: v_writelane_b32 v23, s0, 58 +; CHECK-NEXT: v_writelane_b32 v23, s1, 59 +; CHECK-NEXT: v_writelane_b32 v23, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v23, s3, 61 +; CHECK-NEXT: v_writelane_b32 v23, s4, 62 +; CHECK-NEXT: v_writelane_b32 v0, s6, 0 +; CHECK-NEXT: v_writelane_b32 v23, s5, 63 +; CHECK-NEXT: v_writelane_b32 v0, s7, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 4 -; CHECK-NEXT: v_writelane_b32 v0, s1, 5 -; CHECK-NEXT: v_writelane_b32 v0, s2, 6 -; CHECK-NEXT: v_writelane_b32 v0, s3, 7 -; CHECK-NEXT: v_writelane_b32 v0, s4, 8 -; CHECK-NEXT: v_writelane_b32 v0, s5, 9 -; CHECK-NEXT: v_writelane_b32 v0, s6, 10 -; CHECK-NEXT: v_writelane_b32 v0, s7, 11 -; CHECK-NEXT: v_writelane_b32 v0, s8, 12 -; CHECK-NEXT: v_writelane_b32 v0, s9, 13 -; CHECK-NEXT: v_writelane_b32 v0, s10, 14 -; CHECK-NEXT: v_writelane_b32 v0, s11, 15 -; CHECK-NEXT: v_writelane_b32 v0, s12, 16 -; CHECK-NEXT: v_writelane_b32 v0, s13, 17 -; CHECK-NEXT: v_writelane_b32 v0, s14, 18 -; CHECK-NEXT: v_writelane_b32 v0, s15, 19 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[54:55] -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 2 +; CHECK-NEXT: v_writelane_b32 v0, s1, 3 +; CHECK-NEXT: v_writelane_b32 v0, s2, 4 +; CHECK-NEXT: v_writelane_b32 v0, s3, 5 +; CHECK-NEXT: v_writelane_b32 v0, s4, 6 +; CHECK-NEXT: v_writelane_b32 v0, s5, 7 +; CHECK-NEXT: v_writelane_b32 v0, s6, 8 +; CHECK-NEXT: v_writelane_b32 v0, s7, 9 +; CHECK-NEXT: v_writelane_b32 v0, s8, 10 +; CHECK-NEXT: v_writelane_b32 v0, s9, 11 +; CHECK-NEXT: v_writelane_b32 v0, s10, 12 +; CHECK-NEXT: v_writelane_b32 v0, s11, 13 +; CHECK-NEXT: v_writelane_b32 v0, 
s12, 14 +; CHECK-NEXT: v_writelane_b32 v0, s13, 15 +; CHECK-NEXT: v_writelane_b32 v0, s14, 16 +; CHECK-NEXT: v_writelane_b32 v0, s15, 17 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 18 +; CHECK-NEXT: v_writelane_b32 v0, s1, 19 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND @@ -245,102 +245,102 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v23, 30 ; CHECK-NEXT: v_readlane_b32 s1, v23, 31 +; CHECK-NEXT: v_readlane_b32 s2, v23, 32 +; CHECK-NEXT: v_readlane_b32 s3, v23, 33 +; CHECK-NEXT: v_readlane_b32 s4, v23, 34 +; CHECK-NEXT: v_readlane_b32 s5, v23, 35 +; CHECK-NEXT: v_readlane_b32 s6, v23, 36 +; CHECK-NEXT: v_readlane_b32 s7, v23, 37 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ; use s[42:43] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 32 -; CHECK-NEXT: v_readlane_b32 s1, v23, 33 -; CHECK-NEXT: v_readlane_b32 s2, v23, 34 -; CHECK-NEXT: v_readlane_b32 s3, v23, 35 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ; use s[52:55] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 36 -; CHECK-NEXT: v_readlane_b32 s1, v23, 37 -; CHECK-NEXT: v_readlane_b32 s2, v23, 38 -; CHECK-NEXT: v_readlane_b32 s3, v23, 39 -; CHECK-NEXT: v_readlane_b32 s4, v23, 40 -; CHECK-NEXT: v_readlane_b32 s5, v23, 41 -; CHECK-NEXT: v_readlane_b32 s6, v23, 42 -; CHECK-NEXT: v_readlane_b32 s7, v23, 43 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 44 -; CHECK-NEXT: v_readlane_b32 s1, v23, 45 -; CHECK-NEXT: v_readlane_b32 s2, v23, 46 -; CHECK-NEXT: v_readlane_b32 s3, v23, 47 -; CHECK-NEXT: v_readlane_b32 s4, v23, 48 -; CHECK-NEXT: v_readlane_b32 s5, v23, 49 -; CHECK-NEXT: v_readlane_b32 s6, v23, 50 -; CHECK-NEXT: v_readlane_b32 s7, v23, 51 +; CHECK-NEXT: v_readlane_b32 s0, v23, 38 +; CHECK-NEXT: v_readlane_b32 s1, v23, 39 +; CHECK-NEXT: v_readlane_b32 s2, v23, 40 +; CHECK-NEXT: v_readlane_b32 s3, v23, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:53] +; CHECK-NEXT: ; use s[40:41] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ; use s[36:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[36:43] +; CHECK-NEXT: ; use s[44:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s8, v23, 52 -; CHECK-NEXT: v_readlane_b32 s9, v23, 53 -; CHECK-NEXT: v_readlane_b32 s10, v23, 54 -; CHECK-NEXT: v_readlane_b32 s11, v23, 55 -; CHECK-NEXT: v_readlane_b32 s12, v23, 56 -; CHECK-NEXT: v_readlane_b32 s13, v23, 57 -; CHECK-NEXT: v_readlane_b32 s14, v23, 58 -; CHECK-NEXT: v_readlane_b32 s15, v23, 59 +; CHECK-NEXT: v_readlane_b32 s4, v23, 42 +; CHECK-NEXT: v_readlane_b32 s5, v23, 43 +; CHECK-NEXT: v_readlane_b32 s6, v23, 44 +; CHECK-NEXT: v_readlane_b32 s7, v23, 45 +; CHECK-NEXT: v_readlane_b32 s8, v23, 46 +; CHECK-NEXT: v_readlane_b32 s9, v23, 47 +; CHECK-NEXT: v_readlane_b32 s10, v23, 48 +; CHECK-NEXT: v_readlane_b32 s11, v23, 49 +; CHECK-NEXT: v_readlane_b32 s12, v23, 50 +; CHECK-NEXT: v_readlane_b32 s13, v23, 51 +; CHECK-NEXT: v_readlane_b32 s14, v23, 52 +; CHECK-NEXT: v_readlane_b32 s15, v23, 53 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 60 -; CHECK-NEXT: v_readlane_b32 s1, v23, 
61 -; CHECK-NEXT: v_readlane_b32 s2, v23, 62 -; CHECK-NEXT: v_readlane_b32 s3, v23, 63 -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 -; CHECK-NEXT: v_readlane_b32 s5, v0, 1 -; CHECK-NEXT: v_readlane_b32 s6, v0, 2 -; CHECK-NEXT: v_readlane_b32 s7, v0, 3 +; CHECK-NEXT: v_readlane_b32 s0, v23, 54 +; CHECK-NEXT: v_readlane_b32 s1, v23, 55 +; CHECK-NEXT: v_readlane_b32 s2, v23, 56 +; CHECK-NEXT: v_readlane_b32 s3, v23, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:47] +; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 58 +; CHECK-NEXT: v_readlane_b32 s1, v23, 59 +; CHECK-NEXT: v_readlane_b32 s2, v23, 60 +; CHECK-NEXT: v_readlane_b32 s3, v23, 61 +; CHECK-NEXT: v_readlane_b32 s4, v23, 62 +; CHECK-NEXT: v_readlane_b32 s5, v23, 63 +; CHECK-NEXT: v_readlane_b32 s6, v0, 0 +; CHECK-NEXT: v_readlane_b32 s7, v0, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 4 -; CHECK-NEXT: v_readlane_b32 s1, v0, 5 -; CHECK-NEXT: v_readlane_b32 s2, v0, 6 -; CHECK-NEXT: v_readlane_b32 s3, v0, 7 -; CHECK-NEXT: v_readlane_b32 s4, v0, 8 -; CHECK-NEXT: v_readlane_b32 s5, v0, 9 -; CHECK-NEXT: v_readlane_b32 s6, v0, 10 -; CHECK-NEXT: v_readlane_b32 s7, v0, 11 -; CHECK-NEXT: v_readlane_b32 s8, v0, 12 -; CHECK-NEXT: v_readlane_b32 s9, v0, 13 -; CHECK-NEXT: v_readlane_b32 s10, v0, 14 -; CHECK-NEXT: v_readlane_b32 s11, v0, 15 -; CHECK-NEXT: v_readlane_b32 s12, v0, 16 -; CHECK-NEXT: v_readlane_b32 s13, v0, 17 -; CHECK-NEXT: v_readlane_b32 s14, v0, 18 -; CHECK-NEXT: v_readlane_b32 s15, v0, 19 +; CHECK-NEXT: v_readlane_b32 s0, v0, 2 +; CHECK-NEXT: v_readlane_b32 s1, v0, 3 +; CHECK-NEXT: v_readlane_b32 s2, v0, 4 +; CHECK-NEXT: v_readlane_b32 s3, v0, 5 +; CHECK-NEXT: v_readlane_b32 s4, v0, 6 +; CHECK-NEXT: v_readlane_b32 s5, v0, 7 +; CHECK-NEXT: v_readlane_b32 s6, v0, 8 +; CHECK-NEXT: v_readlane_b32 s7, v0, 9 +; CHECK-NEXT: v_readlane_b32 s8, v0, 10 +; CHECK-NEXT: v_readlane_b32 s9, v0, 11 +; CHECK-NEXT: v_readlane_b32 s10, v0, 12 +; CHECK-NEXT: v_readlane_b32 s11, v0, 13 +; CHECK-NEXT: v_readlane_b32 s12, v0, 14 +; CHECK-NEXT: v_readlane_b32 s13, v0, 15 +; CHECK-NEXT: v_readlane_b32 s14, v0, 16 +; CHECK-NEXT: v_readlane_b32 s15, v0, 17 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 18 +; CHECK-NEXT: v_readlane_b32 s1, v0, 19 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 20 ; CHECK-NEXT: v_readlane_b32 s1, v0, 21 ; CHECK-NEXT: v_readlane_b32 s2, v0, 22 ; CHECK-NEXT: v_readlane_b32 s3, v0, 23 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[54:55] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v0, 24 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 974cb71900a4d..d59660751cc18 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -16,17 +16,17 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; 
GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 @@ -56,22 +56,22 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 @@ -101,56 +101,56 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 -; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: s_ashr_i32 s3, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: s_ashr_i32 s6, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_xor_b32 s7, s0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_ashr_i32 s5, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s3, s1, s3 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; 
GFX9-NEXT: s_sub_i32 s1, 0, s2 +; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s5, 0, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s1, s1, s8 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 -; GFX9-NEXT: s_add_i32 s8, s8, s1 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s8 -; GFX9-NEXT: s_mul_i32 s8, s1, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s8 -; GFX9-NEXT: s_add_i32 s9, s1, 1 -; GFX9-NEXT: s_sub_i32 s8, s0, s2 -; GFX9-NEXT: s_cmp_ge_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s1, s9, s1 -; GFX9-NEXT: s_cselect_b32 s0, s8, s0 -; GFX9-NEXT: s_add_i32 s8, s1, 1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s2 -; GFX9-NEXT: s_cselect_b32 s0, s8, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s3 -; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mul_i32 s5, s5, s8 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s5 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 +; GFX9-NEXT: s_mul_i32 s8, s5, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s8 +; GFX9-NEXT: s_add_i32 s9, s5, 1 +; GFX9-NEXT: s_sub_i32 s8, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s4, s8, s5 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_i32: @@ -1373,17 +1373,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; ; TONGA-LABEL: sdiv_v4i32_4: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 @@ -1401,7 +1401,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32_4: @@ -1672,20 +1672,20 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i23: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; 
TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 ; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 ; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(3) ; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; TONGA-NEXT: s_waitcnt vmcnt(2) @@ -1710,25 +1710,25 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 -; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i23: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2 ; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6 ; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4 ; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1753,7 +1753,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i23: @@ -1859,20 +1859,20 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; TONGA-LABEL: v_sdiv_i24: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 ; TONGA-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; TONGA-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 ; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(3) ; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, 
v0 ; TONGA-NEXT: s_waitcnt vmcnt(2) @@ -1895,25 +1895,25 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24 -; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i24: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6 ; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; GFX9-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2 ; GFX9-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i24: @@ -1997,17 +1997,17 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: v_sdiv_i25: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_bfe_i32 v2, v1, 0, 25 ; GCN-NEXT: v_bfe_i32 v1, v1, 24, 1 @@ -2040,22 +2040,22 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: v_sdiv_i25: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: 
s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_mov_b32 s1, s5 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_bfe_i32 v2, v1, 0, 25 ; TONGA-NEXT: v_bfe_i32 v1, v1, 24, 1 @@ -2088,59 +2088,59 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 -; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: v_sdiv_i25: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: s_bfe_i32 s3, s2, 0x190000 -; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10018 -; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_bfe_i32 s1, s0, 0x190000 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0x10018 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x10018 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v1 -; GFX9-NEXT: s_add_i32 s1, s1, s0 -; GFX9-NEXT: s_xor_b32 s2, s0, s2 -; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: s_bfe_i32 s5, s4, 0x190000 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x10018 +; GFX9-NEXT: s_add_i32 s5, s5, s4 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s1, 0, s3 +; GFX9-NEXT: s_xor_b32 s6, s4, s6 +; GFX9-NEXT: s_xor_b32 s4, s5, s4 +; GFX9-NEXT: s_sub_i32 s5, 0, s7 ; GFX9-NEXT: v_readfirstlane_b32 s8, v0 -; GFX9-NEXT: s_mul_i32 s1, s1, s8 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 -; GFX9-NEXT: s_add_i32 s8, s8, s1 -; GFX9-NEXT: s_mul_hi_u32 s1, s0, s8 -; GFX9-NEXT: s_mul_i32 s8, s1, s3 -; GFX9-NEXT: s_sub_i32 s0, s0, s8 -; GFX9-NEXT: s_add_i32 s9, s1, 1 -; GFX9-NEXT: s_sub_i32 s8, s0, s3 -; GFX9-NEXT: s_cmp_ge_u32 s0, s3 -; GFX9-NEXT: s_cselect_b32 s1, s9, s1 -; GFX9-NEXT: s_cselect_b32 s0, s8, s0 -; GFX9-NEXT: s_add_i32 s8, s1, 1 -; GFX9-NEXT: s_cmp_ge_u32 s0, s3 -; GFX9-NEXT: s_cselect_b32 s0, s8, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_bfe_i32 s0, s0, 0x190000 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mul_i32 s5, s5, s8 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s5 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, 
s4, s8 +; GFX9-NEXT: s_mul_i32 s8, s5, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s8 +; GFX9-NEXT: s_add_i32 s9, s5, 1 +; GFX9-NEXT: s_sub_i32 s8, s4, s7 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s7 +; GFX9-NEXT: s_cselect_b32 s4, s8, s5 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0x190000 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v_sdiv_i25: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index da8896db90494..705a2af739590 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -239,14 +239,14 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v3, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v2, v2, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v6, v3 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 -; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v5, v4 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 +; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 +; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v2, vcc ; GCN-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GCN-NEXT: v_rcp_f32_e32 v5, v5 ; GCN-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 @@ -318,33 +318,33 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v5 +; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_mul_lo_u32 v9, v2, v5 +; GCN-NEXT: v_mul_lo_u32 v9, v3, v5 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, v1, v8 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; GCN-NEXT: v_subb_u32_e64 v9, s[4:5], v10, v3, vcc -; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v2 +; GCN-NEXT: v_subb_u32_e64 v9, s[4:5], v10, v2, vcc +; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v3 ; GCN-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v9, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 ; GCN-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3 ; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v2 ; GCN-NEXT: v_cndmask_b32_e64 v9, v11, v10, s[4:5] ; GCN-NEXT: v_add_i32_e64 v10, s[4:5], 2, v5 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GCN-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v6, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_add_i32_e64 v12, s[4:5], 1, v5 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GCN-NEXT: v_addc_u32_e64 v13, s[4:5], 0, v6, s[4:5] ; 
GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 ; GCN-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v9, v13, v11, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index b03353972ab66..adce63c7e45e7 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -692,17 +692,17 @@ define amdgpu_kernel void @select_v2f16( ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 -; GFX11-NEXT: s_mov_b32 s14, -1 -; GFX11-NEXT: s_mov_b32 s15, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s14 -; GFX11-NEXT: s_mov_b32 s3, s15 -; GFX11-NEXT: s_mov_b32 s22, s14 -; GFX11-NEXT: s_mov_b32 s23, s15 -; GFX11-NEXT: s_mov_b32 s18, s14 -; GFX11-NEXT: s_mov_b32 s19, s15 -; GFX11-NEXT: s_mov_b32 s26, s14 -; GFX11-NEXT: s_mov_b32 s27, s15 +; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s2 +; GFX11-NEXT: s_mov_b32 s15, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s26, s2 +; GFX11-NEXT: s_mov_b32 s27, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s20, s8 ; GFX11-NEXT: s_mov_b32 s21, s9 @@ -710,12 +710,12 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_mov_b32 s17, s7 ; GFX11-NEXT: s_mov_b32 s24, s10 ; GFX11-NEXT: s_mov_b32 s25, s11 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 ; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 ; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 ; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0 -; GFX11-NEXT: s_mov_b32 s12, s4 -; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(2) @@ -730,7 +730,7 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index 3a5638344d751..eaf00e861ce97 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -15,8 +15,11 @@ ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 +; SGPR-NEXT: s_or_saveexec_b64 s[100:101], -1 +; SGPR-NEXT: s_mov_b64 exec, s[100:101] +; SGPR-NEXT: s_nop 2 +; SGPR-NEXT: buffer_store_dword v0, off, s[{{[0-9]+}}:[[HI]]], 0 ; SGPR-NEXT: ; kill: killed $vgpr1 -; SGPR-NEXT: s_nop 4 ; ALL: s_endpgm define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 
02c18720d05f5..08db1e7fee259 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10191,19 +10191,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:64 -; GFX6-NEXT: s_mov_b32 s2, 0x80800 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 ; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: s_mov_b32 s2, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10225,9 +10217,17 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:48 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[7:8], s[4:7], 0 addr64 offset:48 +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: v_mov_b32_e32 v7, 1 ; GFX6-NEXT: s_mov_b64 s[2:3], exec ; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen @@ -10496,25 +10496,23 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84c00 -; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(4) +; GFX6-NEXT: v_mov_b32_e32 v0, v17 +; GFX6-NEXT: v_mov_b32_e32 v1, v18 +; GFX6-NEXT: v_mov_b32_e32 v2, v19 +; GFX6-NEXT: v_mov_b32_e32 v3, v20 ; 
GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x84400 +; GFX6-NEXT: v_mov_b32_e32 v20, v3 ; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s2, 0x83c00 +; GFX6-NEXT: v_mov_b32_e32 v19, v2 +; GFX6-NEXT: v_mov_b32_e32 v18, v1 +; GFX6-NEXT: v_mov_b32_e32 v17, v0 ; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10634,20 +10632,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80800 +; GFX6-NEXT: s_mov_b32 s4, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80c00 +; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s4, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:64 ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48 ; GFX6-NEXT: buffer_store_dwordx4 v[13:16], v[4:5], s[0:3], 0 addr64 offset:32 -; GFX6-NEXT: s_waitcnt expcnt(2) ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll index c056d35c56beb..314785cdbefd6 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll @@ -136,13 +136,13 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 { ; GFX908-DAG: v_accvgpr_read_b32 ; GFX900: NumVgprs: 256 -; 
GFX908: NumVgprs: 252 -; GFX900: ScratchSize: 1668 +; GFX908: NumVgprs: 254 +; GFX900: ScratchSize: 1796 ; GFX908: ScratchSize: 0 ; GFX900: VGPRBlocks: 63 -; GFX908: VGPRBlocks: 62 +; GFX908: VGPRBlocks: 63 ; GFX900: NumVGPRsForWavesPerEU: 256 -; GFX908: NumVGPRsForWavesPerEU: 252 +; GFX908: NumVGPRsForWavesPerEU: 254 define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(ptr addrspace(1) %p) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %p1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %p, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index c85d15bc2fcff..613349f32e2d5 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -218,14 +218,14 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v3, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v2, v2, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v5, v4 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v2, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -297,34 +297,34 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v8, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, v1, v5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v3, vcc -; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v2, vcc +; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v3 ; GCN-NEXT: v_subbrev_u32_e64 v8, s[6:7], 0, v4, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v2 ; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] -; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3 -; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v2 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v2, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] -; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v7, v2 +; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v7, v3 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc 
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] @@ -357,25 +357,25 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6 ; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 ; GCN-IR-NEXT: v_min_u32_e32 v11, v6, v7 -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[6:7], v10, v11 +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v10, v11 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v8, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[7:8] +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[7:8] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v4 ; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 63, v7 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 63, v6 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 @@ -425,15 +425,15 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v7 -; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 ; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v7, v2, v9 -; GCN-IR-NEXT: v_mul_hi_u32 v8, v2, v6 -; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v6 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v6, v2, v9 +; GCN-IR-NEXT: v_mul_hi_u32 v7, v2, v8 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v8 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v8 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v4 @@ -1584,22 +1584,22 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] +; 
GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[3:4] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 @@ -1647,14 +1647,14 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 -; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 ; GCN-IR-NEXT: .LBB11_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 -; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v3 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v3 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v3 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/swdev373493.ll b/llvm/test/CodeGen/AMDGPU/swdev373493.ll index 4d1d88d643f15..4f33e19835172 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev373493.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev373493.ll @@ -22,16 +22,16 @@ define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %a ; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 ; CHECK-NEXT: ; %bb.2: ; %bb7 ; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, global@rel32@lo+1948 -; CHECK-NEXT: s_addc_u32 s17, s17, global@rel32@hi+1956 -; CHECK-NEXT: v_mov_b32_e32 v5, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, s16 -; CHECK-NEXT: v_mov_b32_e32 v1, s17 ; CHECK-NEXT: s_getpc_b64 s[18:19] -; CHECK-NEXT: s_add_u32 s18, s18, eggs@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s19, s19, eggs@rel32@hi+12 -; CHECK-NEXT: s_setpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, global@rel32@lo+1948 +; CHECK-NEXT: s_addc_u32 s19, s19, global@rel32@hi+1956 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s18 +; CHECK-NEXT: v_mov_b32_e32 v1, s19 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, eggs@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, eggs@rel32@hi+12 +; CHECK-NEXT: s_setpc_b64 s[16:17] ; CHECK-NEXT: .LBB0_3: ; %LeafBlock1 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb8 diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index b7d3a96042569..7f989e30118b9 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -89,15 +89,13 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK-NEXT: s_mov_b32 s1, s3 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 
s[2:3] ; CHECK-NEXT: v_writelane_b32 v2, s0, 7 -; CHECK-NEXT: v_writelane_b32 v2, s1, 8 -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s4, s0 +; CHECK-NEXT: v_writelane_b32 v2, s1, 8 ; CHECK-NEXT: v_readlane_b32 s0, v2, 0 ; CHECK-NEXT: v_readlane_b32 s2, v2, 11 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] ; CHECK-NEXT: s_add_i32 s2, s2, s0 ; CHECK-NEXT: v_writelane_b32 v2, s2, 11 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] ; CHECK-NEXT: v_readlane_b32 s0, v2, 11 ; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 88a49cfc6e669..cf30131b8ab58 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -818,9 +818,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: flat_load_dwordx4 v[6:9], v[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1 @@ -855,51 +855,51 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 ; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 -; GCN-NEXT: v_mul_hi_u32 v11, v5, v11 -; GCN-NEXT: v_mul_hi_u32 v12, v6, v12 -; GCN-NEXT: v_mul_hi_u32 v13, v7, v13 +; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 +; GCN-NEXT: v_mul_hi_u32 v11, v7, v11 +; GCN-NEXT: v_mul_hi_u32 v12, v8, v12 +; GCN-NEXT: v_mul_hi_u32 v13, v9, v13 ; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 ; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 ; GCN-NEXT: v_mul_lo_u32 v20, v13, v3 -; GCN-NEXT: v_sub_u32_e32 v4, vcc, v4, v14 -; GCN-NEXT: v_sub_u32_e32 v5, vcc, v5, v16 -; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v18 -; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v20 +; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14 +; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 +; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18 +; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v20 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 ; GCN-NEXT: v_add_u32_e32 v19, vcc, 1, v12 ; GCN-NEXT: v_add_u32_e32 v21, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; GCN-NEXT: v_sub_u32_e32 v14, vcc, v4, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1 +; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 +; GCN-NEXT: v_sub_u32_e32 v14, vcc, v6, v0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] -; GCN-NEXT: v_sub_u32_e32 v15, vcc, v5, v1 +; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_sub_u32_e32 v16, vcc, v6, v2 +; GCN-NEXT: v_sub_u32_e32 v16, vcc, v8, v2 ; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5] -; GCN-NEXT: v_sub_u32_e32 v17, vcc, v7, v3 +; GCN-NEXT: v_sub_u32_e32 v17, vcc, v9, v3 ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[0:1] +; GCN-NEXT: 
v_cndmask_b32_e64 v6, v6, v14, s[0:1] ; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3] ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5] ; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[6:7] ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v14, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v16, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 ; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm ; ; GFX1030-LABEL: udiv_v4i32: @@ -1848,20 +1848,20 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: v_udiv_i24: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -1890,25 +1890,25 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_udiv_i24: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 ; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 ; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 ; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: 
s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -1937,7 +1937,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: v_udiv_i24: diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 972e03b8960d6..894c96acbbcd6 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -333,24 +333,24 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v5, s[6:7], v8, v9 +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v8, v9 ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v6, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[5:6] +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v6, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 @@ -400,15 +400,15 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 -; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4 ; GCN-IR-NEXT: .LBB1_6: ; %Flow5 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v5, v2, v7 -; GCN-IR-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v4 -; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v4, v2, v7 +; GCN-IR-NEXT: v_mul_hi_u32 v5, v2, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v6 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index b22ae9b9c8527..b8d18f56b7602 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -334,8 +334,8 @@ 
define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; GFX906-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v6 @@ -369,8 +369,8 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_lshrrev_b32_e32 v26, 16, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v27, 8, v8 ; GFX906-NEXT: v_lshrrev_b32_e32 v28, 24, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GFX906-NEXT: v_lshrrev_b32_e32 v30, 8, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX906-NEXT: v_lshrrev_b32_e32 v29, 8, v7 ; GFX906-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX906-NEXT: v_lshrrev_b32_e32 v31, 8, v6 @@ -378,10 +378,10 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX906-NEXT: v_lshlrev_b16_e32 v28, 8, v28 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v32 -; GFX906-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v31 +; GFX906-NEXT: v_or_b32_sdwa v28, v30, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v31 ; GFX906-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_or_b32_sdwa v6, v6, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX906-NEXT: v_or_b32_sdwa v6, v6, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v27 ; GFX906-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -409,7 +409,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX906-NEXT: v_or_b32_sdwa v4, v14, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v30 +; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29 ; GFX906-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v10 ; GFX906-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -859,15 +859,15 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v4 ; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:768 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 24, v2 ; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:760 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; GFX906-NEXT: buffer_store_dword v63, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 16, v2 ; GFX906-NEXT: 
buffer_store_dword v62, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 24, v2 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v3 -; GFX906-NEXT: buffer_store_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX906-NEXT: buffer_store_dword v63, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v2 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 @@ -1287,16 +1287,16 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:764 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 24, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v3 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:772 ; 4-byte Folded Spill ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX906-NEXT: v_lshrrev_b32_e32 v62, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v63, 8, v2 ; GFX906-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:776 ; 4-byte Folded Spill ; GFX906-NEXT: .LBB6_2: ; %bb.2 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v62 +; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v63 ; GFX906-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v63 +; GFX906-NEXT: v_lshlrev_b16_e32 v2, 8, v62 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX906-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:768 ; 4-byte Folded Reload ; GFX906-NEXT: buffer_load_dword v62, off, s[8:11], 0 offset:776 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index e274fc2592149..94b822ac48875 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -523,22 +523,22 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: s_mov_b32 s4, 0 -; GFX1032-NEXT: ; implicit-def: $sgpr3 +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: ; implicit-def: $sgpr4 ; GFX1032-NEXT: s_branch .LBB11_4 ; GFX1032-NEXT: .LBB11_2: ; %bb8 ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_add_i32 s4, s4, 1 +; GFX1032-NEXT: s_add_i32 s3, s3, 1 ; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] -; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s4, v1 +; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1 ; GFX1032-NEXT: s_add_u32 s0, s0, 4 ; GFX1032-NEXT: s_addc_u32 s1, s1, 0 -; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo -; GFX1032-NEXT: s_or_b32 s3, s3, s5 +; GFX1032-NEXT: s_or_b32 s4, s4, s5 ; GFX1032-NEXT: .LBB11_3: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: s_and_b32 s5, exec_lo, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4 ; GFX1032-NEXT: s_or_b32 s2, s5, s2 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_cbranch_execz 
.LBB11_6 @@ -546,12 +546,12 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) # ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dword v3, v2, s[0:1] -; GFX1032-NEXT: s_or_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3 ; GFX1032-NEXT: s_cbranch_vccz .LBB11_2 ; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1 -; GFX1032-NEXT: ; implicit-def: $sgpr4 +; GFX1032-NEXT: ; implicit-def: $sgpr3 ; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1032-NEXT: s_branch .LBB11_3 ; GFX1032-NEXT: .LBB11_6: ; %.loopexit @@ -1803,64 +1803,64 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 { ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-NEXT: s_branch .LBB33_2 ; GFX1032-NEXT: .LBB33_1: ; %body ; GFX1032-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1032-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1032-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1032-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1032-NEXT: s_cbranch_execz .LBB33_4 ; GFX1032-NEXT: .LBB33_2: ; %loop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v4 +; GFX1032-NEXT: v_mov_b32_e32 v1, v5 +; GFX1032-NEXT: v_mov_b32_e32 v2, v6 +; GFX1032-NEXT: v_mov_b32_e32 v3, v7 ; GFX1032-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1032-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1032-NEXT: ; implicit-def: $vgpr8 ; GFX1032-NEXT: .LBB33_4: ; %break ; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, v4 -; GFX1032-NEXT: v_mov_b32_e32 v1, v5 -; GFX1032-NEXT: v_mov_b32_e32 v2, v6 -; GFX1032-NEXT: v_mov_b32_e32 v3, v7 ; GFX1032-NEXT: ; return to shader part epilog ; ; GFX1064-LABEL: test_loop_vcc: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_wqm_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-NEXT: s_branch .LBB33_2 ; GFX1064-NEXT: .LBB33_1: ; %body ; GFX1064-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1064-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1064-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1064-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1064-NEXT: s_cbranch_execz .LBB33_4 ; GFX1064-NEXT: .LBB33_2: ; %loop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_cmp_lt_f32_e32 vcc, 0x40e00000, v8 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v4 
+; GFX1064-NEXT: v_mov_b32_e32 v1, v5 +; GFX1064-NEXT: v_mov_b32_e32 v2, v6 +; GFX1064-NEXT: v_mov_b32_e32 v3, v7 ; GFX1064-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1064-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1064-NEXT: ; implicit-def: $vgpr8 ; GFX1064-NEXT: .LBB33_4: ; %break ; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, v4 -; GFX1064-NEXT: v_mov_b32_e32 v1, v5 -; GFX1064-NEXT: v_mov_b32_e32 v2, v6 -; GFX1064-NEXT: v_mov_b32_e32 v3, v7 ; GFX1064-NEXT: ; return to shader part epilog entry: br label %loop diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll index 6bf1a5ed21454..9d7570b9a929e 100644 --- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll @@ -350,16 +350,16 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: r13:12 = add(r5:4,r7:6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: p0 = cmp.gtu(r5:4,r3:2) -; CHECK-NEXT: p1 = cmp.eq(r5:4,r9:8) +; CHECK-NEXT: p1 = cmp.gtu(r5:4,r3:2) +; CHECK-NEXT: p0 = cmp.eq(r5:4,r9:8) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r1 = mux(p0,r2,r12) -; CHECK-NEXT: r14 = mux(p0,r3,r13) +; CHECK-NEXT: r1 = mux(p1,r2,r12) +; CHECK-NEXT: r14 = mux(p1,r3,r13) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r10 = mux(p1,r2,r1) -; CHECK-NEXT: r11 = mux(p1,r3,r14) +; CHECK-NEXT: r10 = mux(p0,r2,r1) +; CHECK-NEXT: r11 = mux(p0,r3,r14) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: memd_locked(r0,p0) = r11:10 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll index 800f89005e310..66db73f5c69f6 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll @@ -493,7 +493,7 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r4 = ##-2147483648 ; CHECK-NEXT: r3:2 = combine(#1,#8) -; CHECK-NEXT: v4 = vmem(r0+#0) +; CHECK-NEXT: v5 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vsplat(r4) @@ -504,11 +504,11 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r7) ; CHECK-NEXT: r5 = #32 -; CHECK-NEXT: v8.w = vasl(v6.w,r3) -; CHECK-NEXT: v6.cur = vmem(r0+#1) +; CHECK-NEXT: v8.w = vasl(v4.w,r3) +; CHECK-NEXT: v4.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v4.w,r3) +; CHECK-NEXT: v7.w = vasl(v5.w,r3) ; CHECK-NEXT: v12 = vxor(v12,v12) ; CHECK-NEXT: v8.w = vsub(v8.w,v1.w) ; CHECK-NEXT: v0 = vmem(r0+#3) @@ -517,11 +517,11 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: v11.w = vasl(v0.w,r3) ; CHECK-NEXT: v7.w = vsub(v7.w,v1.w) -; CHECK-NEXT: q0 = vcmp.gt(v12.w,v4.w) +; CHECK-NEXT: q0 = vcmp.gt(v12.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.w = vasl(v2.w,r3) -; CHECK-NEXT: q1 = vcmp.gt(v12.w,v6.w) +; CHECK-NEXT: q1 = vcmp.gt(v12.w,v4.w) ; CHECK-NEXT: v11.w = vsub(v11.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -530,82 +530,82 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v8.w = vasr(v8.w,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21 = vsplat(r3) +; CHECK-NEXT: v22 = vsplat(r3) ; CHECK-NEXT: v7.w = vasr(v7.w,r6) -; CHECK-NEXT: v18.w = vsub(v9.w,v1.w) +; CHECK-NEXT: v19.w = vsub(v9.w,v1.w) ; CHECK-NEXT: v8.w = vsub(v10.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; 
CHECK-NEXT: v19.w = vasl(v6.w,r2) -; CHECK-NEXT: v26 = vmux(q1,v1,v21) -; CHECK-NEXT: v24 = vmux(q0,v1,v21) +; CHECK-NEXT: v20.w = vasl(v4.w,r2) +; CHECK-NEXT: v27 = vmux(q1,v1,v22) +; CHECK-NEXT: v25 = vmux(q0,v1,v22) ; CHECK-NEXT: v7.w = vsub(v10.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasl(v4.w,r2) +; CHECK-NEXT: v6.w = vasl(v5.w,r2) ; CHECK-NEXT: v8.w = vmin(v8.w,v13.w) -; CHECK-NEXT: v9 = vor(v19,v1) -; CHECK-NEXT: v20.w = vmin(v7.w,v13.w) +; CHECK-NEXT: v9 = vor(v20,v1) +; CHECK-NEXT: v21.w = vmin(v7.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasr(v18.w,r6) +; CHECK-NEXT: v5.w = vasr(v19.w,r6) ; CHECK-NEXT: q3 = vcmp.gt(v8.w,v12.w) -; CHECK-NEXT: v5 = vor(v5,v1) -; CHECK-NEXT: q2 = vcmp.gt(v20.w,v12.w) +; CHECK-NEXT: v6 = vor(v6,v1) +; CHECK-NEXT: q2 = vcmp.gt(v21.w,v12.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.w = vasr(v11.w,r6) -; CHECK-NEXT: v4.w = vsub(v10.w,v4.w) +; CHECK-NEXT: v5.w = vsub(v10.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.w = vasl(v2.w,r2) ; CHECK-NEXT: v10.w = vsub(v10.w,v11.w) -; CHECK-NEXT: v4.w = vmin(v4.w,v13.w) +; CHECK-NEXT: v5.w = vmin(v5.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22.w = vasl(v0.w,r2) +; CHECK-NEXT: v23.w = vasl(v0.w,r2) ; CHECK-NEXT: v3 = vor(v3,v1) ; CHECK-NEXT: v10.w = vmin(v10.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.w = vlsr(v9.w,v8.w) -; CHECK-NEXT: v6 = vor(v22,v1) +; CHECK-NEXT: v4 = vor(v23,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vlsr(v5.w,v20.w) -; CHECK-NEXT: v25.w = vsub(v12.w,v8.w) +; CHECK-NEXT: v6.w = vlsr(v6.w,v21.w) +; CHECK-NEXT: v26.w = vsub(v12.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vlsr(v3.w,v4.w) -; CHECK-NEXT: v23.w = vsub(v12.w,v5.w) -; CHECK-NEXT: v8 = vmux(q1,v25,v8) +; CHECK-NEXT: v3.w = vlsr(v3.w,v5.w) +; CHECK-NEXT: v24.w = vsub(v12.w,v6.w) +; CHECK-NEXT: v8 = vmux(q1,v26,v8) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vlsr(v6.w,v10.w) -; CHECK-NEXT: v5 = vmux(q0,v23,v5) +; CHECK-NEXT: v4.w = vlsr(v4.w,v10.w) +; CHECK-NEXT: v6 = vmux(q0,v24,v6) ; CHECK-NEXT: q0 = vcmp.gt(v12.w,v2.w) -; CHECK-NEXT: v27.w = vsub(v12.w,v3.w) +; CHECK-NEXT: v28.w = vsub(v12.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vmux(q3,v8,v26) -; CHECK-NEXT: v28.w = vsub(v12.w,v6.w) +; CHECK-NEXT: v2 = vmux(q3,v8,v27) +; CHECK-NEXT: v29.w = vsub(v12.w,v4.w) ; CHECK-NEXT: q3 = vcmp.gt(v12.w,v0.w) -; CHECK-NEXT: v5 = vmux(q2,v5,v24) +; CHECK-NEXT: v6 = vmux(q2,v6,v25) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29 = vmux(q0,v1,v21) -; CHECK-NEXT: v3 = vmux(q0,v27,v3) -; CHECK-NEXT: q2 = vcmp.gt(v4.w,v12.w) -; CHECK-NEXT: v30 = vmux(q3,v28,v6) +; CHECK-NEXT: v30 = vmux(q0,v1,v22) +; CHECK-NEXT: v3 = vmux(q0,v28,v3) +; CHECK-NEXT: q2 = vcmp.gt(v5.w,v12.w) +; CHECK-NEXT: v4 = vmux(q3,v29,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vpack(v2.w,v5.w):sat -; CHECK-NEXT: v1 = vmux(q3,v1,v21) +; CHECK-NEXT: v2.h = vpack(v2.w,v6.w):sat +; CHECK-NEXT: v1 = vmux(q3,v1,v22) ; CHECK-NEXT: q3 = vcmp.gt(v10.w,v12.w) -; CHECK-NEXT: v0 = vmux(q2,v3,v29) +; CHECK-NEXT: v0 = vmux(q2,v3,v30) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vmux(q3,v30,v1) +; CHECK-NEXT: v1 = vmux(q3,v4,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.h = vpack(v1.w,v0.w):sat @@ -1547,55 +1547,59 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r3:2 = combine(##-2147483648,#8) -; CHECK-NEXT: r4 = #1 +; CHECK-NEXT: r3:2 = 
combine(#8,#1) +; CHECK-NEXT: r4 = ##-2147483648 ; CHECK-NEXT: v5 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vsplat(r3) +; CHECK-NEXT: v3 = vsplat(r4) ; CHECK-NEXT: r5 = #30 ; CHECK-NEXT: r6 = #24 ; CHECK-NEXT: v2 = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14 = vsplat(r5) -; CHECK-NEXT: v8.w = vasl(v5.w,r4) -; CHECK-NEXT: v13 = vxor(v13,v13) +; CHECK-NEXT: r4 = #32 +; CHECK-NEXT: v8.w = vasl(v5.w,r2) ; CHECK-NEXT: v0 = vmem(r0+#3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r7 = #64 -; CHECK-NEXT: v9.w = vasl(v2.w,r4) +; CHECK-NEXT: v9.w = vasl(v2.w,r2) +; CHECK-NEXT: v13 = vxor(v13,v13) ; CHECK-NEXT: v8.w = vsub(v8.w,v3.w) ; CHECK-NEXT: v1 = vmem(r0+#2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.w = vasl(v0.w,r4) -; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w) +; CHECK-NEXT: v20 = vsplat(r4) +; CHECK-NEXT: v12.w = vasl(v0.w,r2) ; CHECK-NEXT: v9.w = vsub(v9.w,v3.w) -; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w) +; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r4 = #32 -; CHECK-NEXT: v11.w = vasl(v1.w,r4) +; CHECK-NEXT: v11.w = vasl(v1.w,r2) +; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w) ; CHECK-NEXT: v12.w = vsub(v12.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20 = vsplat(r4) -; CHECK-NEXT: v8.w = vasr(v8.w,r6) +; CHECK-NEXT: r2 = ##2147483647 +; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: v11.w = vsub(v11.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v22 = vsplat(r2) +; CHECK-NEXT: v8.w = vasr(v8.w,r6) +; CHECK-NEXT: } +; CHECK-NEXT: { ; CHECK-NEXT: v9.w = vasr(v9.w,r6) ; CHECK-NEXT: v8.w = vsub(v14.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.w = vasl(v5.w,r2) +; CHECK-NEXT: v6.w = vasl(v5.w,r3) ; CHECK-NEXT: v9.w = vsub(v14.w,v9.w) ; CHECK-NEXT: v8.w = vmin(v8.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v2.w,r2) +; CHECK-NEXT: v7.w = vasl(v2.w,r3) ; CHECK-NEXT: v6 = vor(v6,v3) ; CHECK-NEXT: v9.w = vmin(v9.w,v20.w) ; CHECK-NEXT: q1 = vcmp.gt(v13.w,v8.w) @@ -1610,17 +1614,15 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v5.w = vsub(v14.w,v19.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasl(v1.w,r2) +; CHECK-NEXT: v4.w = vasl(v1.w,r3) ; CHECK-NEXT: v21.w = vsub(v14.w,v12.w) ; CHECK-NEXT: v5.w = vmin(v5.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r2 = ##2147483647 -; CHECK-NEXT: v10.w = vasl(v0.w,r2) +; CHECK-NEXT: v10.w = vasl(v0.w,r3) ; CHECK-NEXT: v4 = vor(v4,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22 = vsplat(r2) ; CHECK-NEXT: v6.w = vlsr(v6.w,v8.w) ; CHECK-NEXT: v3 = vor(v10,v3) ; CHECK-NEXT: v10.w = vmin(v21.w,v20.w) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll index 4c651ae474053..5cfa09b0822bb 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll @@ -19,9 +19,9 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.h = vunpack(v0.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.h = vsplat(r7) +; CHECK-NEXT: v2.h = vsplat(r7) ; CHECK-NEXT: r3:2 = combine(#31,#5) -; CHECK-NEXT: v2.h = vabs(v0.h) +; CHECK-NEXT: v3.h = vabs(v0.h) ; CHECK-NEXT: v4.h = vabs(v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -31,62 +31,62 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r5 = ##32768 -; CHECK-NEXT: v5.uh = vcl0(v2.uh) +; CHECK-NEXT: v5.uh = vcl0(v3.uh) ; CHECK-NEXT: q0 = vcmp.gt(v9.h,v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10.h = vsplat(r5) ; 
CHECK-NEXT: r4 = #10 ; CHECK-NEXT: v6.uh = vcl0(v4.uh) -; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) +; CHECK-NEXT: v5.h = vadd(v5.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vmux(q0,v10,v9) -; CHECK-NEXT: v6.h = vadd(v6.h,v3.h) +; CHECK-NEXT: v6.h = vadd(v6.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vasl(v2.h,v5.h) +; CHECK-NEXT: v3.h = vasl(v3.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.h = vasl(v4.h,v6.h) -; CHECK-NEXT: v13 = vand(v2,v8) -; CHECK-NEXT: v11.h = vadd(v2.h,v7.h) +; CHECK-NEXT: v13 = vand(v3,v8) +; CHECK-NEXT: v11.h = vadd(v3.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.h = vadd(v4.h,v7.h) ; CHECK-NEXT: q2 = vcmp.eq(v13.h,v9.h) ; CHECK-NEXT: v8 = vand(v4,v8) -; CHECK-NEXT: q1 = vcmp.gt(v2.uh,v11.uh) +; CHECK-NEXT: q1 = vcmp.gt(v3.uh,v11.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r2) -; CHECK-NEXT: v13 = vmux(q2,v9,v3) +; CHECK-NEXT: v13 = vmux(q2,v9,v2) ; CHECK-NEXT: q2 = vcmp.eq(v8.h,v9.h) ; CHECK-NEXT: q3 = vcmp.gt(v4.uh,v14.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v20.uh = vlsr(v14.uh,r2) -; CHECK-NEXT: v22 = vmux(q2,v9,v3) -; CHECK-NEXT: v21 = vmux(q1,v3,v9) -; CHECK-NEXT: v3 = vmux(q3,v3,v9) +; CHECK-NEXT: v22 = vmux(q2,v9,v2) +; CHECK-NEXT: v21 = vmux(q1,v2,v9) +; CHECK-NEXT: v2 = vmux(q3,v2,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uh = vlsr(v4.uh,r2) ; CHECK-NEXT: v13.h = vadd(v11.h,v13.h) ; CHECK-NEXT: v24.h = vadd(v20.h,v22.h) -; CHECK-NEXT: v3.h = vadd(v3.h,v7.h) +; CHECK-NEXT: v2.h = vadd(v2.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uh = vlsr(v2.uh,r2) +; CHECK-NEXT: v12.uh = vlsr(v3.uh,r2) ; CHECK-NEXT: v23.h = vadd(v21.h,v7.h) -; CHECK-NEXT: v3.h = vsub(v3.h,v6.h) +; CHECK-NEXT: v2.h = vsub(v2.h,v6.h) ; CHECK-NEXT: q3 = vcmp.gt(v9.h,v1.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.uh = vlsr(v11.uh,r7) -; CHECK-NEXT: v2.h = vsub(v23.h,v5.h) -; CHECK-NEXT: q1 = vcmp.eq(v12.h,v11.h) -; CHECK-NEXT: q2 = vcmp.eq(v19.h,v20.h) +; CHECK-NEXT: v3.h = vsub(v23.h,v5.h) +; CHECK-NEXT: q2 = vcmp.eq(v12.h,v11.h) +; CHECK-NEXT: q1 = vcmp.eq(v19.h,v20.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uh = vlsr(v13.uh,r7) @@ -95,29 +95,29 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uh = vlsr(v24.uh,r7) -; CHECK-NEXT: v5 = vmux(q1,v25,v11) +; CHECK-NEXT: v5 = vmux(q2,v25,v11) +; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.uh = vlsr(v20.uh,r7) ; CHECK-NEXT: v5 = vor(v27,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vasl(v2.h,r4) -; CHECK-NEXT: v4 = vmux(q2,v26,v4) -; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h) +; CHECK-NEXT: v3.h = vasl(v3.h,r4) +; CHECK-NEXT: v4 = vmux(q1,v26,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.h = vasl(v3.h,r4) +; CHECK-NEXT: v2.h = vasl(v2.h,r4) ; CHECK-NEXT: v4 = vor(v28,v4) -; CHECK-NEXT: v29 = vor(v5,v2) +; CHECK-NEXT: v29 = vor(v5,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3 = vor(v4,v3) +; CHECK-NEXT: v2 = vor(v4,v2) ; CHECK-NEXT: v31 = vmux(q3,v9,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v9,v3) +; CHECK-NEXT: v30 = vmux(q2,v9,v2) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -236,190 +236,188 @@ define void @s8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v4 = vsplat(r7) ; CHECK-NEXT: r6 = ##-2147483648 -; CHECK-NEXT: v15 = vxor(v15,v15) +; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: } ; CHECK-NEXT: { -; 
CHECK-NEXT: v16 = vsplat(r6) -; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v7:6.h = vunpack(v1.b) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v18 = vsplat(r5) +; CHECK-NEXT: v8 = vsplat(r6) ; CHECK-NEXT: v1:0.w = vunpack(v8.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7:6.w = vunpack(v6.h) -; CHECK-NEXT: v8.w = vabs(v1.w) ; CHECK-NEXT: v5.w = vabs(v0.w) +; CHECK-NEXT: v10.w = vabs(v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vabs(v6.w) -; CHECK-NEXT: v11.w = vabs(v7.w) -; CHECK-NEXT: q0 = vcmp.gt(v15.w,v6.w) +; CHECK-NEXT: v26.w = vabs(v6.w) +; CHECK-NEXT: v13.w = vabs(v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vcl0(v8.uw) -; CHECK-NEXT: v17 = vmux(q0,v16,v15) -; CHECK-NEXT: q0 = vcmp.gt(v15.w,v7.w) +; CHECK-NEXT: v9.uw = vcl0(v5.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.uw = vcl0(v9.uw) -; CHECK-NEXT: v12.w = vadd(v12.w,v2.w) +; CHECK-NEXT: v12.uw = vcl0(v26.uw) +; CHECK-NEXT: v9.w = vadd(v9.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vcl0(v11.uw) -; CHECK-NEXT: v13.w = vadd(v13.w,v2.w) +; CHECK-NEXT: v14.uw = vcl0(v13.uw) +; CHECK-NEXT: v15.w = vadd(v12.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v10.uw = vcl0(v5.uw) -; CHECK-NEXT: v14.w = vadd(v14.w,v2.w) +; CHECK-NEXT: v11.uw = vcl0(v10.uw) +; CHECK-NEXT: v12.w = vadd(v14.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v9.w,v13.w) -; CHECK-NEXT: v10.w = vadd(v10.w,v2.w) +; CHECK-NEXT: v27.w = vasl(v26.w,v15.w) +; CHECK-NEXT: v11.w = vadd(v11.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasl(v11.w,v14.w) -; CHECK-NEXT: v20 = vand(v9,v4) -; CHECK-NEXT: v19.w = vadd(v9.w,v3.w) +; CHECK-NEXT: v13.w = vasl(v13.w,v12.w) +; CHECK-NEXT: v20 = vand(v27,v4) +; CHECK-NEXT: v19.w = vadd(v27.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v8.w,v12.w) -; CHECK-NEXT: v23.w = vadd(v11.w,v3.w) -; CHECK-NEXT: q3 = vcmp.eq(v20.w,v15.w) -; CHECK-NEXT: v28 = vand(v11,v4) +; CHECK-NEXT: v16.w = vasl(v5.w,v9.w) +; CHECK-NEXT: v5 = vxor(v5,v5) +; CHECK-NEXT: v23.w = vadd(v13.w,v3.w) +; CHECK-NEXT: v28 = vand(v13,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q3,v15,v2) -; CHECK-NEXT: q3 = vcmp.eq(v28.w,v15.w) -; CHECK-NEXT: v22 = vand(v8,v4) -; CHECK-NEXT: q2 = vcmp.gt(v9.uw,v19.uw) +; CHECK-NEXT: v17.w = vasl(v10.w,v11.w) +; CHECK-NEXT: q3 = vcmp.eq(v20.w,v5.w) +; CHECK-NEXT: q2 = vcmp.gt(v27.uw,v19.uw) +; CHECK-NEXT: q0 = vcmp.gt(v5.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v9.uw,r2) -; CHECK-NEXT: v27 = vmux(q3,v15,v2) -; CHECK-NEXT: q1 = vcmp.eq(v22.w,v15.w) -; CHECK-NEXT: v24 = vmux(q2,v2,v15) +; CHECK-NEXT: v21.uw = vlsr(v27.uw,r2) +; CHECK-NEXT: v30 = vmux(q3,v5,v2) +; CHECK-NEXT: q3 = vcmp.eq(v28.w,v5.w) +; CHECK-NEXT: v22 = vand(v17,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.uw = vlsr(v19.uw,r2) -; CHECK-NEXT: v26 = vmux(q1,v15,v2) -; CHECK-NEXT: v13.w = vsub(v24.w,v13.w) +; CHECK-NEXT: v14.uw = vlsr(v19.uw,r2) +; CHECK-NEXT: v27 = vmux(q3,v5,v2) +; CHECK-NEXT: q1 = vcmp.eq(v22.w,v5.w) +; CHECK-NEXT: v24 = vmux(q2,v2,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v31.uw = vlsr(v23.uw,r2) -; CHECK-NEXT: v22.w = vadd(v9.w,v30.w) -; CHECK-NEXT: v30.w = vadd(v8.w,v3.w) -; CHECK-NEXT: q2 = vcmp.eq(v21.w,v9.w) +; CHECK-NEXT: v22.w = vadd(v14.w,v30.w) +; CHECK-NEXT: v30.w = vadd(v17.w,v3.w) +; CHECK-NEXT: q2 = vcmp.eq(v21.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasl(v5.w,v10.w) +; CHECK-NEXT: v29.uw = vlsr(v13.uw,r2) ; 
CHECK-NEXT: v28.w = vadd(v31.w,v27.w) -; CHECK-NEXT: v13.w = vadd(v13.w,v18.w) +; CHECK-NEXT: v3.w = vadd(v16.w,v3.w) +; CHECK-NEXT: v4 = vand(v16,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v11.uw,r2) -; CHECK-NEXT: v3.w = vadd(v5.w,v3.w) -; CHECK-NEXT: v4 = vand(v5,v4) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v19.uw = vlsr(v31.uw,r0) +; CHECK-NEXT: v14.uw = vlsr(v14.uw,r0) ; CHECK-NEXT: q3 = vcmp.eq(v29.w,v31.w) -; CHECK-NEXT: v31 = vmux(q0,v16,v15) -; CHECK-NEXT: q0 = vcmp.gt(v5.uw,v3.uw) +; CHECK-NEXT: v18 = vmux(q0,v8,v5) +; CHECK-NEXT: q0 = vcmp.gt(v5.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v19.uw = vlsr(v31.uw,r0) +; CHECK-NEXT: v26 = vmux(q1,v5,v2) +; CHECK-NEXT: v31 = vmux(q0,v8,v5) +; CHECK-NEXT: q0 = vcmp.gt(v16.uw,v3.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.uw = vlsr(v9.uw,r0) -; CHECK-NEXT: v19 = vmux(q3,v20,v19) -; CHECK-NEXT: q3 = vcmp.eq(v4.w,v15.w) +; CHECK-NEXT: v10 = vsplat(r5) +; CHECK-NEXT: v29.uw = vlsr(v22.uw,r0) +; CHECK-NEXT: v15.w = vsub(v24.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v22.uw,r0) -; CHECK-NEXT: v19 = vor(v31,v19) +; CHECK-NEXT: v20.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v14 = vmux(q2,v29,v14) +; CHECK-NEXT: q2 = vcmp.gt(v13.uw,v23.uw) +; CHECK-NEXT: v15.w = vadd(v15.w,v10.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2) -; CHECK-NEXT: v9 = vmux(q2,v29,v9) -; CHECK-NEXT: q2 = vcmp.gt(v11.uw,v23.uw) -; CHECK-NEXT: v29 = vmux(q3,v15,v2) +; CHECK-NEXT: v19 = vmux(q3,v20,v19) +; CHECK-NEXT: q3 = vcmp.eq(v4.w,v5.w) +; CHECK-NEXT: v27 = vmux(q2,v2,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2) -; CHECK-NEXT: v27 = vmux(q2,v2,v15) -; CHECK-NEXT: q2 = vcmp.gt(v8.uw,v30.uw) +; CHECK-NEXT: q2 = vcmp.gt(v17.uw,v30.uw) ; CHECK-NEXT: v28.w = vadd(v25.w,v26.w) +; CHECK-NEXT: v29 = vmux(q3,v5,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.uw = vlsr(v8.uw,r2) -; CHECK-NEXT: v31 = vmux(q2,v2,v15) -; CHECK-NEXT: v2 = vmux(q0,v2,v15) -; CHECK-NEXT: v30.w = vadd(v3.w,v29.w) +; CHECK-NEXT: v17.uw = vlsr(v17.uw,r2) +; CHECK-NEXT: v19 = vor(v31,v19) +; CHECK-NEXT: v31 = vmux(q2,v2,v5) +; CHECK-NEXT: v2 = vmux(q0,v2,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) -; CHECK-NEXT: v2.w = vsub(v2.w,v10.w) -; CHECK-NEXT: q3 = vcmp.eq(v8.w,v25.w) -; CHECK-NEXT: v22.w = vsub(v31.w,v12.w) +; CHECK-NEXT: v24.uw = vlsr(v16.uw,r2) +; CHECK-NEXT: v30.w = vadd(v3.w,v29.w) +; CHECK-NEXT: v2.w = vsub(v2.w,v9.w) +; CHECK-NEXT: v11.w = vsub(v31.w,v11.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vlsr(v28.uw,r0) -; CHECK-NEXT: v4.w = vsub(v27.w,v14.w) -; CHECK-NEXT: v8.w = vadd(v22.w,v18.w) -; CHECK-NEXT: v2.w = vadd(v2.w,v18.w) +; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: q3 = vcmp.eq(v17.w,v25.w) +; CHECK-NEXT: v4.w = vsub(v27.w,v12.w) +; CHECK-NEXT: v2.w = vadd(v2.w,v10.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.uw = vlsr(v25.uw,r0) +; CHECK-NEXT: v13.uw = vlsr(v25.uw,r0) ; CHECK-NEXT: q0 = vcmp.eq(v24.w,v3.w) -; CHECK-NEXT: q2 = vcmp.gt(v15.w,v1.w) -; CHECK-NEXT: v4.w = vadd(v4.w,v18.w) +; CHECK-NEXT: v21.w = vadd(v11.w,v10.w) +; CHECK-NEXT: q2 = vcmp.gt(v5.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v30.uw,r0) -; CHECK-NEXT: v5 = vmux(q3,v5,v11) -; CHECK-NEXT: q3 = vcmp.gt(v15.w,v0.w) -; CHECK-NEXT: v24 = vmux(q2,v16,v15) +; CHECK-NEXT: v22.uw = vlsr(v30.uw,r0) +; CHECK-NEXT: v23 = vmux(q3,v16,v13) +; CHECK-NEXT: q3 = 
vcmp.gt(v5.w,v0.w) +; CHECK-NEXT: v24 = vmux(q2,v8,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v3.uw = vlsr(v3.uw,r0) -; CHECK-NEXT: v25 = vmux(q3,v16,v15) -; CHECK-NEXT: v5 = vor(v24,v5) -; CHECK-NEXT: v9 = vor(v17,v9) +; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) +; CHECK-NEXT: v8 = vmux(q3,v8,v5) +; CHECK-NEXT: v10 = vor(v24,v23) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v8.w,r4) -; CHECK-NEXT: v3 = vmux(q0,v23,v3) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v15.w) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v15.w) +; CHECK-NEXT: v9.w = vasl(v21.w,r4) +; CHECK-NEXT: v3 = vmux(q0,v22,v3) +; CHECK-NEXT: v14 = vor(v18,v14) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v2.w = vasl(v2.w,r4) -; CHECK-NEXT: v3 = vor(v25,v3) -; CHECK-NEXT: v5 = vor(v5,v8) +; CHECK-NEXT: v3 = vor(v8,v3) +; CHECK-NEXT: v25 = vor(v10,v9) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.w = vasl(v13.w,r4) +; CHECK-NEXT: v15.w = vasl(v15.w,r4) ; CHECK-NEXT: v2 = vor(v3,v2) -; CHECK-NEXT: v27 = vmux(q2,v15,v5) +; CHECK-NEXT: v27 = vmux(q2,v5,v25) ; CHECK-NEXT: vmem(r1+#1) = v27.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.w = vasl(v4.w,r4) -; CHECK-NEXT: v29 = vmux(q3,v15,v2) -; CHECK-NEXT: q2 = vcmp.eq(v7.w,v15.w) +; CHECK-NEXT: v29 = vmux(q3,v5,v2) +; CHECK-NEXT: q2 = vcmp.eq(v7.w,v5.w) ; CHECK-NEXT: vmem(r1+#0) = v29.new ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28 = vor(v19,v26) -; CHECK-NEXT: v30 = vor(v9,v13) -; CHECK-NEXT: q3 = vcmp.eq(v6.w,v15.w) +; CHECK-NEXT: v30 = vor(v14,v15) +; CHECK-NEXT: q3 = vcmp.eq(v6.w,v5.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q2,v15,v28) -; CHECK-NEXT: v31 = vmux(q3,v15,v30) +; CHECK-NEXT: v0 = vmux(q2,v5,v28) +; CHECK-NEXT: v31 = vmux(q3,v5,v30) ; CHECK-NEXT: vmem(r1+#3) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -740,25 +738,25 @@ define void @s16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r7 = #1 -; CHECK-NEXT: r3:2 = combine(#31,#64) +; CHECK-NEXT: r3:2 = combine(#31,#1) +; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: v1.h = vabs(v0.h) ; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vsplat(r7) +; CHECK-NEXT: v2.h = vsplat(r2) ; CHECK-NEXT: v5.h = vsplat(r3) ; CHECK-NEXT: r6 = #5 ; CHECK-NEXT: v3 = vxor(v3,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.h = vsplat(r2) +; CHECK-NEXT: v6.h = vsplat(r7) ; CHECK-NEXT: r4 = ##32768 ; CHECK-NEXT: v4.uh = vcl0(v1.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.h = vsplat(r4) -; CHECK-NEXT: r3 = #10 +; CHECK-NEXT: r4 = #10 ; CHECK-NEXT: q2 = vcmp.gt(v3.h,v0.h) ; CHECK-NEXT: v4.h = vadd(v4.h,v2.h) ; CHECK-NEXT: } @@ -789,15 +787,15 @@ define void @s16f16_1(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: q3 = vcmp.eq(v1.h,v25.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uh = vlsr(v25.uh,r7) +; CHECK-NEXT: v27.uh = vlsr(v25.uh,r2) ; CHECK-NEXT: v28.h = vsub(v2.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uh = vlsr(v7.uh,r7) +; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.h = vasl(v28.h,r3) -; CHECK-NEXT: q3 = vsetq(r2) +; CHECK-NEXT: v1.h = vasl(v28.h,r4) +; CHECK-NEXT: q3 = vsetq(r7) ; CHECK-NEXT: v2 = vmux(q3,v29,v27) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -835,9 +833,9 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.w = vunpack(v0.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vsplat(r0) +; CHECK-NEXT: v3 = vsplat(r0) ; CHECK-NEXT: r7 = #512 -; 
CHECK-NEXT: v3.w = vabs(v0.w) +; CHECK-NEXT: v4.w = vabs(v0.w) ; CHECK-NEXT: v6.w = vabs(v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -849,60 +847,60 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: r6 = ##-2147483648 -; CHECK-NEXT: v7.uw = vcl0(v3.uw) +; CHECK-NEXT: v7.uw = vcl0(v4.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r6) ; CHECK-NEXT: v8.uw = vcl0(v6.uw) ; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w) -; CHECK-NEXT: v7.w = vadd(v7.w,v4.w) +; CHECK-NEXT: v7.w = vadd(v7.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #23 -; CHECK-NEXT: v8.w = vadd(v8.w,v4.w) +; CHECK-NEXT: v8.w = vadd(v8.w,v3.w) ; CHECK-NEXT: v27 = vmux(q0,v10,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vasl(v3.w,v7.w) +; CHECK-NEXT: v4.w = vasl(v4.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,v8.w) -; CHECK-NEXT: v11.w = vadd(v3.w,v5.w) -; CHECK-NEXT: v12 = vand(v3,v9) +; CHECK-NEXT: v11.w = vadd(v4.w,v5.w) +; CHECK-NEXT: v12 = vand(v4,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.w = vadd(v6.w,v5.w) ; CHECK-NEXT: v9 = vand(v6,v9) ; CHECK-NEXT: q1 = vcmp.eq(v12.w,v2.w) -; CHECK-NEXT: q2 = vcmp.gt(v3.uw,v11.uw) +; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v11.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v22.uw = vlsr(v11.uw,r2) ; CHECK-NEXT: q3 = vcmp.eq(v9.w,v2.w) -; CHECK-NEXT: v23 = vmux(q1,v2,v4) -; CHECK-NEXT: v14 = vmux(q2,v4,v2) +; CHECK-NEXT: v23 = vmux(q1,v2,v3) +; CHECK-NEXT: v14 = vmux(q2,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2) ; CHECK-NEXT: v11.w = vadd(v22.w,v23.w) ; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v5.uw) -; CHECK-NEXT: v25 = vmux(q3,v2,v4) +; CHECK-NEXT: v25 = vmux(q3,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v3.uw,r2) +; CHECK-NEXT: v21.uw = vlsr(v4.uw,r2) ; CHECK-NEXT: v5.w = vadd(v24.w,v25.w) -; CHECK-NEXT: v4 = vmux(q2,v4,v2) +; CHECK-NEXT: v3 = vmux(q2,v3,v2) ; CHECK-NEXT: v7.w = vsub(v14.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v4.w = vsub(v4.w,v8.w) +; CHECK-NEXT: v3.w = vsub(v3.w,v8.w) ; CHECK-NEXT: q3 = vcmp.eq(v21.w,v22.w) ; CHECK-NEXT: v7.w = vadd(v7.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.uw = vlsr(v22.uw,r0) -; CHECK-NEXT: v4.w = vadd(v4.w,v13.w) +; CHECK-NEXT: v4.uw = vlsr(v22.uw,r0) +; CHECK-NEXT: v3.w = vadd(v3.w,v13.w) ; CHECK-NEXT: q2 = vcmp.eq(v6.w,v24.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -910,13 +908,13 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0) -; CHECK-NEXT: v3 = vmux(q3,v11,v3) +; CHECK-NEXT: v4 = vmux(q3,v11,v4) ; CHECK-NEXT: q3 = vcmp.gt(v2.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v26.uw = vlsr(v24.uw,r0) ; CHECK-NEXT: v28 = vmux(q3,v10,v2) -; CHECK-NEXT: v3 = vor(v27,v3) +; CHECK-NEXT: v4 = vor(v27,v4) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -925,17 +923,17 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vasl(v4.w,r4) +; CHECK-NEXT: v3.w = vasl(v3.w,r4) ; CHECK-NEXT: v5 = vor(v28,v5) -; CHECK-NEXT: v29 = vor(v3,v7) +; CHECK-NEXT: v29 = vor(v4,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vor(v5,v4) +; CHECK-NEXT: v3 = vor(v5,v3) ; CHECK-NEXT: v31 = vmux(q3,v2,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v2,v4) +; CHECK-NEXT: v30 = vmux(q2,v2,v3) ; CHECK-NEXT: 
jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -1044,114 +1042,114 @@ define void @s32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 -; CHECK-NEXT: v3.w = vabs(v1.w) +; CHECK-NEXT: v6.w = vabs(v1.w) ; CHECK-NEXT: v1.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vsplat(r2) +; CHECK-NEXT: v2 = vsplat(r2) ; CHECK-NEXT: r4 = #512 -; CHECK-NEXT: v2.w = vabs(v0.w) +; CHECK-NEXT: v5.w = vabs(v0.w) ; CHECK-NEXT: v0.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9 = vsplat(r4) ; CHECK-NEXT: v8 = vsplat(r6) -; CHECK-NEXT: v6.uw = vcl0(v3.uw) -; CHECK-NEXT: v7 = vxor(v7,v7) +; CHECK-NEXT: v3.uw = vcl0(v6.uw) +; CHECK-NEXT: v20 = vxor(v20,v20) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 -; CHECK-NEXT: v5.uw = vcl0(v2.uw) -; CHECK-NEXT: v6.w = vadd(v6.w,v4.w) +; CHECK-NEXT: v4.uw = vcl0(v5.uw) +; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27 = vsplat(r4) ; CHECK-NEXT: r5 = ##-2147483648 -; CHECK-NEXT: v5.w = vadd(v5.w,v4.w) +; CHECK-NEXT: v7.w = vadd(v4.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) -; CHECK-NEXT: v3.w = vasl(v3.w,v6.w) -; CHECK-NEXT: q0 = vcmp.gt(v7.w,v1.w) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,v5.w) -; CHECK-NEXT: v26 = vmux(q0,v13,v7) -; CHECK-NEXT: v10.w = vadd(v3.w,v8.w) -; CHECK-NEXT: v11 = vand(v3,v9) +; CHECK-NEXT: v6.w = vasl(v6.w,v3.w) +; CHECK-NEXT: q0 = vcmp.gt(v20.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9 = vand(v2,v9) -; CHECK-NEXT: q1 = vcmp.eq(v11.w,v7.w) -; CHECK-NEXT: v8.w = vadd(v2.w,v8.w) -; CHECK-NEXT: q2 = vcmp.gt(v3.uw,v10.uw) +; CHECK-NEXT: v5.w = vasl(v5.w,v7.w) +; CHECK-NEXT: v26 = vmux(q0,v13,v20) +; CHECK-NEXT: v10.w = vadd(v6.w,v8.w) +; CHECK-NEXT: v11 = vand(v6,v9) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vlsr(v3.uw,r3) -; CHECK-NEXT: q3 = vcmp.eq(v9.w,v7.w) -; CHECK-NEXT: v22 = vmux(q1,v7,v4) -; CHECK-NEXT: q1 = vcmp.gt(v2.uw,v8.uw) +; CHECK-NEXT: v9 = vand(v5,v9) +; CHECK-NEXT: q3 = vcmp.eq(v11.w,v20.w) +; CHECK-NEXT: v8.w = vadd(v5.w,v8.w) +; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.uw = vlsr(v10.uw,r3) -; CHECK-NEXT: v24 = vmux(q3,v7,v4) -; CHECK-NEXT: v23 = vmux(q2,v4,v7) -; CHECK-NEXT: v4 = vmux(q1,v4,v7) +; CHECK-NEXT: v21.uw = vlsr(v10.uw,r3) +; CHECK-NEXT: q2 = vcmp.eq(v9.w,v20.w) +; CHECK-NEXT: v22 = vmux(q3,v20,v2) +; CHECK-NEXT: q3 = vcmp.gt(v5.uw,v8.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3) -; CHECK-NEXT: v9.w = vadd(v3.w,v22.w) -; CHECK-NEXT: v6.w = vsub(v23.w,v6.w) -; CHECK-NEXT: v4.w = vsub(v4.w,v5.w) +; CHECK-NEXT: v9.w = vadd(v21.w,v22.w) +; CHECK-NEXT: v24 = vmux(q2,v20,v2) +; CHECK-NEXT: v23 = vmux(q1,v2,v20) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.uw = vlsr(v2.uw,r3) +; CHECK-NEXT: v12.uw = vlsr(v6.uw,r3) +; CHECK-NEXT: v2 = vmux(q3,v2,v20) ; CHECK-NEXT: v25.w = vadd(v8.w,v24.w) -; CHECK-NEXT: q3 = vcmp.eq(v12.w,v3.w) -; CHECK-NEXT: v6.w = vadd(v6.w,v27.w) +; CHECK-NEXT: v3.w = vsub(v23.w,v3.w) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v5.uw = vlsr(v5.uw,r3) +; CHECK-NEXT: v2.w = vsub(v2.w,v7.w) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v21.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 -; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2) -; CHECK-NEXT: q2 = vcmp.eq(v2.w,v8.w) -; CHECK-NEXT: v4.w = vadd(v4.w,v27.w) +; CHECK-NEXT: v6.uw = vlsr(v21.uw,r2) +; CHECK-NEXT: 
q2 = vcmp.eq(v5.w,v8.w) +; CHECK-NEXT: v2.w = vadd(v2.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v25.uw,r2) -; CHECK-NEXT: v3 = vmux(q3,v9,v3) -; CHECK-NEXT: q3 = vcmp.gt(v7.w,v0.w) +; CHECK-NEXT: v6 = vmux(q3,v9,v6) +; CHECK-NEXT: q3 = vcmp.gt(v20.w,v0.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.uw = vlsr(v8.uw,r2) -; CHECK-NEXT: v30 = vmux(q3,v13,v7) -; CHECK-NEXT: v3 = vor(v26,v3) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w) +; CHECK-NEXT: v29.uw = vlsr(v8.uw,r2) +; CHECK-NEXT: v30 = vmux(q3,v13,v20) +; CHECK-NEXT: v6 = vor(v26,v6) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.w = vasl(v6.w,r3) -; CHECK-NEXT: v2 = vmux(q2,v28,v2) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w) +; CHECK-NEXT: v3.w = vasl(v3.w,r3) +; CHECK-NEXT: v5 = vmux(q2,v28,v29) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v20.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v4.w,r3) -; CHECK-NEXT: v31 = vor(v30,v2) -; CHECK-NEXT: v3 = vor(v3,v29) +; CHECK-NEXT: v2.w = vasl(v2.w,r3) +; CHECK-NEXT: v31 = vor(v30,v5) +; CHECK-NEXT: v3 = vor(v6,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vor(v31,v2) -; CHECK-NEXT: v3 = vmux(q2,v7,v3) +; CHECK-NEXT: v3 = vmux(q2,v20,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmux(q3,v7,v1) +; CHECK-NEXT: v0 = vmux(q3,v20,v1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v7.sf) +; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v20.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v7.sf) +; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v20.sf) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.hf = v3:2.qf32 @@ -1454,60 +1452,60 @@ define void @u8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vsplat(r6) -; CHECK-NEXT: v5.h = vsplat(r3) +; CHECK-NEXT: v3.h = vsplat(r6) +; CHECK-NEXT: v4.h = vsplat(r3) ; CHECK-NEXT: r5 = #64 -; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.h = vsplat(r5) ; CHECK-NEXT: r4 = #10 -; CHECK-NEXT: v4.uh = vcl0(v0.uh) +; CHECK-NEXT: v5.uh = vcl0(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uh = vcl0(v1.uh) -; CHECK-NEXT: v4.h = vadd(v4.h,v2.h) +; CHECK-NEXT: v5.h = vadd(v5.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.h = vadd(v7.h,v2.h) +; CHECK-NEXT: v7.h = vadd(v7.h,v3.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.h = vasl(v0.h,v4.h) +; CHECK-NEXT: v8.h = vasl(v0.h,v5.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v11.h = vasl(v1.h,v7.h) ; CHECK-NEXT: v10 = vand(v8,v6) -; CHECK-NEXT: v9.h = vadd(v8.h,v5.h) +; CHECK-NEXT: v9.h = vadd(v8.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v22.h = vadd(v11.h,v5.h) +; CHECK-NEXT: v22.h = vadd(v11.h,v4.h) ; CHECK-NEXT: v6 = vand(v11,v6) ; CHECK-NEXT: q0 = vcmp.gt(v8.uh,v9.uh) -; CHECK-NEXT: q1 = vcmp.eq(v10.h,v3.h) +; CHECK-NEXT: q1 = vcmp.eq(v10.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uh = vlsr(v8.uh,r2) -; CHECK-NEXT: q2 = vcmp.eq(v6.h,v3.h) +; CHECK-NEXT: q2 = vcmp.eq(v6.h,v2.h) ; CHECK-NEXT: q3 = vcmp.gt(v11.uh,v22.uh) -; CHECK-NEXT: v12 = vmux(q1,v3,v2) +; CHECK-NEXT: v12 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uh = vlsr(v9.uh,r2) -; CHECK-NEXT: v13 = vmux(q2,v3,v2) -; CHECK-NEXT: v25 = vmux(q0,v2,v3) -; CHECK-NEXT: v2 = vmux(q3,v2,v3) +; CHECK-NEXT: v13 = vmux(q2,v2,v3) +; CHECK-NEXT: v25 = vmux(q0,v3,v2) +; CHECK-NEXT: v3 = 
vmux(q3,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uh = vlsr(v22.uh,r2) ; CHECK-NEXT: v24.h = vadd(v9.h,v12.h) -; CHECK-NEXT: v2.h = vadd(v2.h,v5.h) -; CHECK-NEXT: v12.h = vadd(v25.h,v5.h) +; CHECK-NEXT: v3.h = vadd(v3.h,v4.h) +; CHECK-NEXT: v12.h = vadd(v25.h,v4.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v23.uh = vlsr(v11.uh,r2) ; CHECK-NEXT: v13.h = vadd(v8.h,v13.h) -; CHECK-NEXT: v4.h = vsub(v12.h,v4.h) -; CHECK-NEXT: v2.h = vsub(v2.h,v7.h) +; CHECK-NEXT: v5.h = vsub(v12.h,v5.h) +; CHECK-NEXT: v3.h = vsub(v3.h,v7.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v14.uh = vlsr(v9.uh,r6) @@ -1519,28 +1517,28 @@ define void @u8f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v27.uh = vlsr(v13.uh,r6) -; CHECK-NEXT: v5 = vmux(q2,v26,v14) -; CHECK-NEXT: q2 = vcmp.eq(v1.h,v3.h) +; CHECK-NEXT: v4 = vmux(q2,v26,v14) +; CHECK-NEXT: q2 = vcmp.eq(v1.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uh = vlsr(v8.uh,r6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.h = vasl(v4.h,r4) +; CHECK-NEXT: v5.h = vasl(v5.h,r4) ; CHECK-NEXT: v6 = vmux(q3,v27,v28) -; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h) +; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.h = vasl(v2.h,r4) -; CHECK-NEXT: v29 = vor(v5,v4) +; CHECK-NEXT: v3.h = vasl(v3.h,r4) +; CHECK-NEXT: v29 = vor(v4,v5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vor(v6,v2) -; CHECK-NEXT: v31 = vmux(q3,v3,v29) +; CHECK-NEXT: v3 = vor(v6,v3) +; CHECK-NEXT: v31 = vmux(q3,v2,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v3,v2) +; CHECK-NEXT: v30 = vmux(q2,v2,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -1635,6 +1633,7 @@ define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: r6 = #512 ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -1643,19 +1642,16 @@ define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1 = valign(v0,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v15 = vsplat(r6) ; CHECK-NEXT: v6 = vsplat(r3) -; CHECK-NEXT: r6 = #512 +; CHECK-NEXT: r5 = #159 ; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13 = vsplat(r6) -; CHECK-NEXT: r5 = #159 +; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v31:30.uh = vunpack(v1.ub) -; CHECK-NEXT: v15 = vxor(v15,v15) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v16 = vsplat(r5) -; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { @@ -1670,153 +1666,153 @@ define void @u8f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8.uw = vcl0(v3.uw) -; CHECK-NEXT: v7.w = vadd(v7.w,v4.w) +; CHECK-NEXT: v11.w = vadd(v7.w,v4.w) +; CHECK-NEXT: v7 = vxor(v7,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v9.uw = vcl0(v1.uw) -; CHECK-NEXT: v8.w = vadd(v8.w,v4.w) +; CHECK-NEXT: v10.w = vadd(v8.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v10.w = vasl(v2.w,v5.w) -; CHECK-NEXT: v9.w = vadd(v9.w,v4.w) +; CHECK-NEXT: v9 = vsplat(r5) +; CHECK-NEXT: v14.w = vasl(v0.w,v11.w) +; CHECK-NEXT: v8.w = vadd(v9.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.w = vasl(v0.w,v7.w) -; CHECK-NEXT: v19 = vand(v10,v13) -; CHECK-NEXT: v18.w = vadd(v10.w,v6.w) +; CHECK-NEXT: v12.w = vasl(v2.w,v5.w) +; CHECK-NEXT: v24 = vand(v14,v15) +; CHECK-NEXT: v20.w = vadd(v14.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.w = vasl(v3.w,v8.w) -; CHECK-NEXT: v24 = 
vand(v12,v13) -; CHECK-NEXT: q2 = vcmp.eq(v19.w,v15.w) -; CHECK-NEXT: v20.w = vadd(v12.w,v6.w) +; CHECK-NEXT: v13.w = vasl(v3.w,v10.w) +; CHECK-NEXT: v19 = vand(v12,v15) +; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w) +; CHECK-NEXT: v18.w = vadd(v12.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.w = vasl(v1.w,v9.w) -; CHECK-NEXT: v23 = vand(v11,v13) -; CHECK-NEXT: v22.w = vadd(v11.w,v6.w) -; CHECK-NEXT: q3 = vcmp.eq(v24.w,v15.w) +; CHECK-NEXT: v16.w = vasl(v1.w,v8.w) +; CHECK-NEXT: v23 = vand(v13,v15) +; CHECK-NEXT: v22.w = vadd(v13.w,v6.w) +; CHECK-NEXT: q0 = vcmp.gt(v14.uw,v20.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v19.uw = vlsr(v18.uw,r2) -; CHECK-NEXT: v6.w = vadd(v14.w,v6.w) -; CHECK-NEXT: v13 = vand(v14,v13) -; CHECK-NEXT: v31 = vmux(q3,v15,v4) +; CHECK-NEXT: v6.w = vadd(v16.w,v6.w) +; CHECK-NEXT: v15 = vand(v16,v15) +; CHECK-NEXT: v30 = vmux(q3,v7,v4) +; CHECK-NEXT: q2 = vcmp.eq(v19.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v12.uw,r2) -; CHECK-NEXT: q3 = vcmp.eq(v13.w,v15.w) -; CHECK-NEXT: v28 = vmux(q2,v15,v4) -; CHECK-NEXT: q0 = vcmp.gt(v12.uw,v20.uw) +; CHECK-NEXT: v21.uw = vlsr(v14.uw,r2) +; CHECK-NEXT: q3 = vcmp.eq(v15.w,v7.w) +; CHECK-NEXT: v28 = vmux(q0,v4,v7) +; CHECK-NEXT: q1 = vcmp.eq(v23.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vlsr(v20.uw,r2) -; CHECK-NEXT: q1 = vcmp.eq(v23.w,v15.w) -; CHECK-NEXT: v26 = vmux(q3,v15,v4) -; CHECK-NEXT: v23.w = vadd(v19.w,v28.w) +; CHECK-NEXT: v14.uw = vlsr(v20.uw,r2) +; CHECK-NEXT: v26 = vmux(q3,v7,v4) +; CHECK-NEXT: v11.w = vsub(v28.w,v11.w) +; CHECK-NEXT: q3 = vcmp.gt(v13.uw,v22.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v13.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v20.w = vadd(v12.w,v31.w) -; CHECK-NEXT: q3 = vcmp.gt(v11.uw,v22.uw) -; CHECK-NEXT: v31 = vmux(q1,v15,v4) +; CHECK-NEXT: v15.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: v20.w = vadd(v14.w,v30.w) +; CHECK-NEXT: v30 = vmux(q1,v7,v4) +; CHECK-NEXT: v31 = vmux(q2,v7,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.uw = vlsr(v22.uw,r2) -; CHECK-NEXT: v30.w = vadd(v13.w,v26.w) -; CHECK-NEXT: q1 = vcmp.gt(v10.uw,v18.uw) -; CHECK-NEXT: v29 = vmux(q0,v4,v15) +; CHECK-NEXT: v19.uw = vlsr(v18.uw,r2) +; CHECK-NEXT: v29.w = vadd(v15.w,v26.w) +; CHECK-NEXT: q1 = vcmp.gt(v12.uw,v18.uw) +; CHECK-NEXT: v11.w = vadd(v11.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0) -; CHECK-NEXT: q0 = vcmp.eq(v21.w,v12.w) -; CHECK-NEXT: v22.w = vadd(v28.w,v31.w) -; CHECK-NEXT: v23 = vmux(q3,v4,v15) +; CHECK-NEXT: v28.uw = vlsr(v22.uw,r2) +; CHECK-NEXT: v23.w = vadd(v19.w,v31.w) +; CHECK-NEXT: v22 = vmux(q3,v4,v7) +; CHECK-NEXT: q3 = vcmp.gt(v16.uw,v6.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.uw = vlsr(v30.uw,r0) -; CHECK-NEXT: v31 = vmux(q1,v4,v15) -; CHECK-NEXT: q3 = vcmp.gt(v14.uw,v6.uw) -; CHECK-NEXT: v30.w = vsub(v23.w,v8.w) +; CHECK-NEXT: v24.uw = vlsr(v29.uw,r0) +; CHECK-NEXT: v31.w = vadd(v28.w,v30.w) +; CHECK-NEXT: v30 = vmux(q1,v4,v7) +; CHECK-NEXT: v4 = vmux(q3,v4,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vsub(v31.w,v5.w) -; CHECK-NEXT: v4 = vmux(q3,v4,v15) -; CHECK-NEXT: v7.w = vsub(v29.w,v7.w) -; CHECK-NEXT: v6.w = vadd(v30.w,v16.w) +; CHECK-NEXT: v17.uw = vlsr(v12.uw,r2) +; CHECK-NEXT: v5.w = vsub(v30.w,v5.w) +; CHECK-NEXT: v29.w = vsub(v22.w,v10.w) +; CHECK-NEXT: v4.w = vsub(v4.w,v8.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v17.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: v4.w = vsub(v4.w,v9.w) -; CHECK-NEXT: v5.w = vadd(v5.w,v16.w) -; CHECK-NEXT: v7.w = vadd(v7.w,v16.w) 
+; CHECK-NEXT: v13.uw = vlsr(v13.uw,r2) +; CHECK-NEXT: v6.w = vadd(v29.w,v9.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v9.w) +; CHECK-NEXT: q0 = vcmp.eq(v21.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.uw = vlsr(v11.uw,r2) +; CHECK-NEXT: v25.uw = vlsr(v16.uw,r2) ; CHECK-NEXT: q2 = vcmp.eq(v17.w,v19.w) -; CHECK-NEXT: v4.w = vadd(v4.w,v16.w) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: v25.uw = vlsr(v14.uw,r2) -; CHECK-NEXT: q3 = vcmp.eq(v11.w,v28.w) +; CHECK-NEXT: q3 = vcmp.eq(v13.w,v28.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uw = vlsr(v12.uw,r0) -; CHECK-NEXT: q1 = vcmp.eq(v25.w,v13.w) +; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0) +; CHECK-NEXT: q1 = vcmp.eq(v25.w,v15.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v29.uw = vlsr(v19.uw,r0) +; CHECK-NEXT: v23.uw = vlsr(v19.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v12.uw = vlsr(v22.uw,r0) -; CHECK-NEXT: v23 = vmux(q2,v21,v29) -; CHECK-NEXT: q2 = vcmp.eq(v3.w,v15.w) +; CHECK-NEXT: v31.uw = vlsr(v31.uw,r0) +; CHECK-NEXT: v23 = vmux(q2,v21,v23) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v14.uw = vlsr(v28.uw,r0) +; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.uw = vlsr(v20.uw,r0) -; CHECK-NEXT: v8 = vmux(q3,v12,v14) -; CHECK-NEXT: q3 = vcmp.eq(v2.w,v15.w) +; CHECK-NEXT: v26.uw = vlsr(v15.uw,r0) +; CHECK-NEXT: v8 = vmux(q3,v31,v16) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vasl(v6.w,r4) -; CHECK-NEXT: v20 = vmux(q0,v20,v27) +; CHECK-NEXT: v22 = vmux(q1,v24,v26) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.uw = vlsr(v13.uw,r0) +; CHECK-NEXT: v5.w = vasl(v5.w,r4) ; CHECK-NEXT: v6 = vor(v8,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vasl(v5.w,r4) -; CHECK-NEXT: v22 = vmux(q1,v24,v26) -; CHECK-NEXT: v26 = vmux(q2,v15,v6) +; CHECK-NEXT: v27.uw = vlsr(v14.uw,r0) +; CHECK-NEXT: v25 = vor(v23,v5) +; CHECK-NEXT: v26 = vmux(q2,v7,v6) ; CHECK-NEXT: vmem(r1+#1) = v26.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v7.w,r4) -; CHECK-NEXT: v25 = vor(v23,v5) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v15.w) +; CHECK-NEXT: v20.uw = vlsr(v20.uw,r0) +; CHECK-NEXT: v28 = vmux(q3,v7,v25) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w) +; CHECK-NEXT: vmem(r1+#0) = v28.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v24.w = vasl(v4.w,r4) -; CHECK-NEXT: v28 = vmux(q3,v15,v25) -; CHECK-NEXT: v29 = vor(v20,v7) -; CHECK-NEXT: vmem(r1+#0) = v28.new +; CHECK-NEXT: v11.w = vasl(v11.w,r4) +; CHECK-NEXT: v20 = vmux(q0,v20,v27) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27 = vor(v22,v24) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v15.w) +; CHECK-NEXT: v24.w = vasl(v4.w,r4) +; CHECK-NEXT: v29 = vor(v20,v11) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v15,v27) -; CHECK-NEXT: v31 = vmux(q3,v15,v29) -; CHECK-NEXT: vmem(r1+#3) = v30.new +; CHECK-NEXT: v27 = vor(v22,v24) +; CHECK-NEXT: v31 = vmux(q3,v7,v29) +; CHECK-NEXT: vmem(r1+#2) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v30 = vmux(q2,v7,v27) ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: vmem(r1+#2) = v31 +; CHECK-NEXT: vmem(r1+#3) = v30.new ; CHECK-NEXT: } %v0 = load <128 x i8>, ptr %a0, align 128 %v1 = uitofp <128 x i8> %v0 to <128 x float> @@ -1830,112 +1826,110 @@ define void @u8f32_1(ptr %a0, ptr %a1) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: v0 = vmem(r0+#0) -; CHECK-NEXT: } -; CHECK-NEXT: { ; 
CHECK-NEXT: r7 = #1 ; CHECK-NEXT: r6 = #512 -; CHECK-NEXT: v1:0.uh = vunpack(v0.ub) +; CHECK-NEXT: v3:2.uh = vunpack(v0.ub) +; CHECK-NEXT: v0.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v1 = vsplat(r7) ; CHECK-NEXT: v8 = vsplat(r6) ; CHECK-NEXT: r3:2 = combine(##255,#8) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r5 = #159 -; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) -; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: v3:2.uw = vunpack(v2.uh) +; CHECK-NEXT: v21 = vxor(v21,v21) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v13 = vsplat(r5) ; CHECK-NEXT: r4 = #23 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.uw = vcl0(v0.uw) +; CHECK-NEXT: v4.uw = vcl0(v2.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vcl0(v1.uw) -; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) +; CHECK-NEXT: v5.uw = vcl0(v3.uw) +; CHECK-NEXT: v4.w = vadd(v4.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v7.w = vasl(v0.w,v4.w) +; CHECK-NEXT: v7.w = vasl(v2.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v9.w = vasl(v1.w,v5.w) +; CHECK-NEXT: v9.w = vasl(v3.w,v5.w) ; CHECK-NEXT: v11 = vand(v7,v8) ; CHECK-NEXT: v10.w = vadd(v7.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) -; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w) +; CHECK-NEXT: q1 = vcmp.eq(v11.w,v21.w) ; CHECK-NEXT: v8 = vand(v9,v8) -; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw) +; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: v21 = vmux(q0,v3,v2) -; CHECK-NEXT: q3 = vcmp.eq(v8.w,v3.w) -; CHECK-NEXT: q0 = vcmp.gt(v9.uw,v6.uw) +; CHECK-NEXT: v22.uw = vlsr(v10.uw,r2) +; CHECK-NEXT: v24 = vmux(q1,v21,v1) +; CHECK-NEXT: q3 = vcmp.eq(v8.w,v21.w) +; CHECK-NEXT: q1 = vcmp.gt(v9.uw,v6.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v20.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v22 = vmux(q1,v2,v3) -; CHECK-NEXT: v24 = vmux(q3,v3,v2) -; CHECK-NEXT: v2 = vmux(q0,v2,v3) +; CHECK-NEXT: v23.uw = vlsr(v6.uw,r2) +; CHECK-NEXT: v25 = vmux(q0,v1,v21) +; CHECK-NEXT: v27 = vmux(q3,v21,v1) +; CHECK-NEXT: v1 = vmux(q1,v1,v21) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.w = vsub(v22.w,v4.w) -; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) -; CHECK-NEXT: v10.w = vadd(v19.w,v21.w) -; CHECK-NEXT: v25.w = vadd(v20.w,v24.w) +; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) +; CHECK-NEXT: v1.w = vsub(v1.w,v5.w) +; CHECK-NEXT: v10.w = vadd(v22.w,v24.w) +; CHECK-NEXT: v28.w = vadd(v23.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: v4.w = vadd(v4.w,v13.w) -; CHECK-NEXT: v2.w = vadd(v2.w,v13.w) +; CHECK-NEXT: v1.w = vadd(v1.w,v13.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v23.uw = vlsr(v9.uw,r2) -; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w) +; CHECK-NEXT: v26.uw = vlsr(v9.uw,r2) +; CHECK-NEXT: q2 = vcmp.eq(v12.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v11.uw = vlsr(v19.uw,r7) -; CHECK-NEXT: q3 = vcmp.eq(v23.w,v20.w) +; CHECK-NEXT: v11.uw = vlsr(v22.uw,r7) +; CHECK-NEXT: q3 = vcmp.eq(v26.w,v23.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7) +; CHECK-NEXT: v30.uw = vlsr(v10.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v26.uw = vlsr(v20.uw,r7) -; CHECK-NEXT: v5 = vmux(q2,v27,v11) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) +; CHECK-NEXT: v29.uw = vlsr(v23.uw,r7) +; CHECK-NEXT: v5 = vmux(q2,v30,v11) +; CHECK-NEXT: q2 = vcmp.eq(v3.w,v21.w) ; 
CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v6.uw = vlsr(v25.uw,r7) +; CHECK-NEXT: v6.uw = vlsr(v28.uw,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,r4) -; CHECK-NEXT: v6 = vmux(q3,v6,v26) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: v6 = vmux(q3,v6,v29) +; CHECK-NEXT: q3 = vcmp.eq(v2.w,v21.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,r4) -; CHECK-NEXT: v29 = vor(v5,v4) +; CHECK-NEXT: v1.w = vasl(v1.w,r4) +; CHECK-NEXT: v31 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28 = vor(v6,v2) -; CHECK-NEXT: v31 = vmux(q3,v3,v29) -; CHECK-NEXT: vmem(r1+#0) = v31.new +; CHECK-NEXT: v1 = vor(v6,v1) +; CHECK-NEXT: v0 = vmux(q3,v21,v31) +; CHECK-NEXT: vmem(r1+#0) = v0.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v3,v28) +; CHECK-NEXT: v1 = vmux(q2,v21,v1) ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: vmem(r1+#1) = v30.new +; CHECK-NEXT: vmem(r1+#1) = v1.new ; CHECK-NEXT: } %v0 = load <64 x i8>, ptr %a0, align 128 %v1 = uitofp <64 x i8> %v0 to <64 x float> @@ -2188,10 +2182,10 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v1:0.uw = vunpack(v0.uh) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vsplat(r7) +; CHECK-NEXT: v3 = vsplat(r7) ; CHECK-NEXT: v6 = vsplat(r3) ; CHECK-NEXT: r6 = #512 -; CHECK-NEXT: v3 = vxor(v3,v3) +; CHECK-NEXT: v2 = vxor(v2,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v8 = vsplat(r6) @@ -2202,10 +2196,10 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v14 = vsplat(r5) ; CHECK-NEXT: v5.uw = vcl0(v1.uw) -; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.w = vadd(v5.w,v2.w) +; CHECK-NEXT: v5.w = vadd(v5.w,v3.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.w = vasl(v0.w,v4.w) @@ -2218,31 +2212,31 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v9.w,v6.w) ; CHECK-NEXT: v8 = vand(v9,v8) -; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w) -; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw) +; CHECK-NEXT: q1 = vcmp.eq(v11.w,v2.w) +; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2) -; CHECK-NEXT: q2 = vcmp.eq(v8.w,v3.w) +; CHECK-NEXT: q2 = vcmp.eq(v8.w,v2.w) ; CHECK-NEXT: q3 = vcmp.gt(v9.uw,v6.uw) -; CHECK-NEXT: v20 = vmux(q0,v3,v2) +; CHECK-NEXT: v20 = vmux(q1,v2,v3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v21.uw = vlsr(v6.uw,r2) -; CHECK-NEXT: v22 = vmux(q2,v3,v2) -; CHECK-NEXT: v25 = vmux(q1,v2,v3) -; CHECK-NEXT: v2 = vmux(q3,v2,v3) +; CHECK-NEXT: v22 = vmux(q2,v2,v3) +; CHECK-NEXT: v25 = vmux(q0,v3,v2) +; CHECK-NEXT: v3 = vmux(q3,v3,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v4.w = vsub(v25.w,v4.w) -; CHECK-NEXT: v2.w = vsub(v2.w,v5.w) +; CHECK-NEXT: v3.w = vsub(v3.w,v5.w) ; CHECK-NEXT: v23.w = vadd(v19.w,v20.w) ; CHECK-NEXT: v10.w = vadd(v21.w,v22.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: v4.w = vadd(v4.w,v14.w) -; CHECK-NEXT: v2.w = vadd(v2.w,v14.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v14.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v24.uw = vlsr(v9.uw,r2) @@ -2258,7 +2252,7 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7) ; CHECK-NEXT: v5 = vmux(q2,v26,v13) -; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v28.uw = vlsr(v21.uw,r7) @@ -2266,19 +2260,19 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 { ; 
CHECK-NEXT: { ; CHECK-NEXT: v4.w = vasl(v4.w,r4) ; CHECK-NEXT: v6 = vmux(q3,v27,v28) -; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w) +; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2.w = vasl(v2.w,r4) +; CHECK-NEXT: v3.w = vasl(v3.w,r4) ; CHECK-NEXT: v29 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v2 = vor(v6,v2) -; CHECK-NEXT: v31 = vmux(q3,v3,v29) +; CHECK-NEXT: v3 = vor(v6,v3) +; CHECK-NEXT: v31 = vmux(q3,v2,v29) ; CHECK-NEXT: vmem(r1+#0) = v31.new ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vmux(q2,v3,v2) +; CHECK-NEXT: v30 = vmux(q2,v2,v3) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: vmem(r1+#1) = v30.new ; CHECK-NEXT: } @@ -2375,20 +2369,20 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#8,#1) ; CHECK-NEXT: r6 = #255 -; CHECK-NEXT: v1.uw = vcl0(v0.uw) +; CHECK-NEXT: v3.uw = vcl0(v0.uw) ; CHECK-NEXT: v0.cur = vmem(r0+#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4 = vsplat(r2) +; CHECK-NEXT: v2 = vsplat(r2) ; CHECK-NEXT: r4 = #512 -; CHECK-NEXT: v3.uw = vcl0(v2.uw) -; CHECK-NEXT: v2.cur = vmem(r0+#0) +; CHECK-NEXT: v4.uw = vcl0(v1.uw) +; CHECK-NEXT: v1.cur = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7 = vsplat(r4) ; CHECK-NEXT: v6 = vsplat(r6) -; CHECK-NEXT: v3.w = vadd(v3.w,v4.w) -; CHECK-NEXT: v1.w = vadd(v1.w,v4.w) +; CHECK-NEXT: v4.w = vadd(v4.w,v2.w) +; CHECK-NEXT: v3.w = vadd(v3.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r4 = #159 @@ -2396,57 +2390,57 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v10 = vsplat(r4) -; CHECK-NEXT: v5.w = vasl(v2.w,v3.w) +; CHECK-NEXT: v5.w = vasl(v1.w,v4.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v8.w = vasl(v0.w,v1.w) +; CHECK-NEXT: v8.w = vasl(v0.w,v3.w) ; CHECK-NEXT: v11.w = vadd(v5.w,v6.w) ; CHECK-NEXT: v13 = vand(v5,v7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.w = vadd(v8.w,v6.w) ; CHECK-NEXT: v7 = vand(v8,v7) -; CHECK-NEXT: q0 = vcmp.gt(v5.uw,v11.uw) -; CHECK-NEXT: q1 = vcmp.eq(v13.w,v9.w) +; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v11.uw) +; CHECK-NEXT: q2 = vcmp.eq(v13.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v28.uw = vlsr(v11.uw,r3) +; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3) ; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw) -; CHECK-NEXT: q2 = vcmp.eq(v7.w,v9.w) -; CHECK-NEXT: v30 = vmux(q0,v4,v9) +; CHECK-NEXT: q0 = vcmp.eq(v7.w,v9.w) +; CHECK-NEXT: v28 = vmux(q2,v9,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3) -; CHECK-NEXT: v29 = vmux(q1,v9,v4) -; CHECK-NEXT: v31 = vmux(q3,v4,v9) -; CHECK-NEXT: v4 = vmux(q2,v9,v4) +; CHECK-NEXT: v29 = vmux(q1,v2,v9) +; CHECK-NEXT: v30 = vmux(q3,v2,v9) +; CHECK-NEXT: v2 = vmux(q0,v9,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { +; CHECK-NEXT: v4.w = vsub(v29.w,v4.w) +; CHECK-NEXT: v7.w = vadd(v27.w,v28.w) ; CHECK-NEXT: v3.w = vsub(v30.w,v3.w) -; CHECK-NEXT: v7.w = vadd(v28.w,v29.w) -; CHECK-NEXT: v1.w = vsub(v31.w,v1.w) -; CHECK-NEXT: v4.w = vadd(v6.w,v4.w) +; CHECK-NEXT: v2.w = vadd(v6.w,v2.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3) +; CHECK-NEXT: v4.w = vadd(v4.w,v10.w) ; CHECK-NEXT: v3.w = vadd(v3.w,v10.w) -; CHECK-NEXT: v1.w = vadd(v1.w,v10.w) -; CHECK-NEXT: q2 = vcmp.eq(v2.w,v9.w) +; CHECK-NEXT: q2 = vcmp.eq(v1.w,v9.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #23 ; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3) -; CHECK-NEXT: q3 = vcmp.eq(v12.w,v28.w) +; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v5.uw = vlsr(v28.uw,r2) +; 
CHECK-NEXT: v5.uw = vlsr(v27.uw,r2) ; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v4.uw = vlsr(v4.uw,r2) +; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2) ; CHECK-NEXT: v5 = vmux(q3,v7,v5) ; CHECK-NEXT: q3 = vcmp.eq(v0.w,v9.w) ; CHECK-NEXT: } @@ -2454,16 +2448,16 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 { ; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v3.w = vasl(v3.w,r3) -; CHECK-NEXT: v2 = vmux(q1,v4,v6) +; CHECK-NEXT: v4.w = vasl(v4.w,r3) +; CHECK-NEXT: v31 = vmux(q1,v2,v6) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1.w = vasl(v1.w,r3) -; CHECK-NEXT: v3 = vor(v5,v3) +; CHECK-NEXT: v2.w = vasl(v3.w,r3) +; CHECK-NEXT: v4 = vor(v5,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vor(v2,v1) -; CHECK-NEXT: v3 = vmux(q2,v9,v3) +; CHECK-NEXT: v1 = vor(v31,v2) +; CHECK-NEXT: v3 = vmux(q2,v9,v4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0 = vmux(q3,v9,v1) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll index 2384ca4f95ec4..6fa0585843f46 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-truncate.ll @@ -14,15 +14,15 @@ define void @fred(<16 x i32> %a0, <16 x i32> %a1) #0 { ; CHECK-NEXT: { ; CHECK-NEXT: r1:0 = combine(#-1,#32) ; CHECK-NEXT: v2 = vxor(v2,v2) -; CHECK-NEXT: q0 = vcmp.eq(v0.w,v1.w) +; CHECK-NEXT: q1 = vcmp.eq(v0.w,v1.w) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7 = ##g0 -; CHECK-NEXT: q1 = vsetq(r0) -; CHECK-NEXT: v0 = vmux(q0,v0,v2) +; CHECK-NEXT: q0 = vsetq(r0) +; CHECK-NEXT: v0 = vmux(q1,v0,v2) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v30 = vand(q1,r1) +; CHECK-NEXT: v30 = vand(q0,r1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v0.h = vpacke(v0.w,v0.w) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll index 7a79c7f981ae9..c18672ba0a833 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/vmpy-parts.ll @@ -321,58 +321,58 @@ define <64 x i32> @f10(<32 x i32> %a0, <32 x i32> %a1) #0 { ; V60-NEXT: r0 = ##33686018 ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v3:2.uw = vmpy(v0.uh,v1.uh) +; V60-NEXT: v3:2 = vcombine(v0,v1) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: r2 = #16 +; V60-NEXT: v1:0.uw = vmpy(v0.uh,v1.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v4 = vxor(v4,v4) +; V60-NEXT: r2 = #16 ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: v5 = vsplat(r0) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: q1 = vcmp.gt(v4.w,v0.w) +; V60-NEXT: v4 = vxor(v4,v4) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: q0 = vcmp.gt(v4.w,v1.w) +; V60-NEXT: v6.uw = vlsr(v0.uw,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v6.uw = vlsr(v2.uw,r2) +; V60-NEXT: q1 = vcmp.gt(v4.w,v3.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v30 = vmux(q1,v1,v4) +; V60-NEXT: q0 = vcmp.gt(v4.w,v2.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v5 = vdelta(v1,v5) +; V60-NEXT: v5 = vdelta(v2,v5) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: if (q0) v30.w += v0.w +; V60-NEXT: v2 = vmux(q1,v2,v4) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v9:8.uw = vmpy(v0.uh,v5.uh) +; V60-NEXT: if (q0) v2.w += v3.w ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v9:8.w = vadd(v9.uh,v8.uh) +; V60-NEXT: v9:8.uw = vmpy(v3.uh,v5.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v29.w = vadd(v8.w,v6.w) +; V60-NEXT: v9:8.w = vadd(v9.uh,v8.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v2.w += vasl(v8.w,r2) +; 
V60-NEXT: v31.w = vadd(v8.w,v6.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v9.w += vasr(v29.w,r2) +; V60-NEXT: v0.w += vasl(v8.w,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v31.w = vadd(v3.w,v9.w) +; V60-NEXT: v9.w += vasr(v31.w,r2) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v3.w = vsub(v31.w,v30.w) +; V60-NEXT: v1.w = vadd(v1.w,v9.w) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: v1:0 = vcombine(v3,v2) +; V60-NEXT: v1.w = vsub(v1.w,v2.w) ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: jumpr r31 diff --git a/llvm/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll b/llvm/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll index be2b7b4d60107..00e9cf25a7044 100644 --- a/llvm/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll +++ b/llvm/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll @@ -16,7 +16,6 @@ entry: %cmp199 = icmp eq i16 %call197, 0 br i1 %cmp199, label %if.then200, label %if.else201 -; CHECK: = add ; CHECK-DAG: [[R4:r[0-9]+]] = add ; CHECK-DAG: p0 = cmp.eq(r0,#0) ; CHECK: if (!p0) [[R3:r[0-9]+]] = add(r{{[0-9]+}},#3) diff --git a/llvm/test/CodeGen/Hexagon/ntstbit.ll b/llvm/test/CodeGen/Hexagon/ntstbit.ll index afd71c217cefb..00a2dcfcf2f47 100644 --- a/llvm/test/CodeGen/Hexagon/ntstbit.ll +++ b/llvm/test/CodeGen/Hexagon/ntstbit.ll @@ -7,7 +7,7 @@ define i32 @f0(i32 %a0, i32 %a1, i32 %a2) #0 { ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { ; CHECK-NEXT: p0 = !tstbit(r1,r2) -; CHECK-NEXT: r17:16 = combine(r0,r1) +; CHECK-NEXT: r17:16 = combine(r1,r0) ; CHECK-NEXT: memd(r29+#-16) = r17:16 ; CHECK-NEXT: allocframe(#8) ; CHECK-NEXT: } // 8-byte Folded Spill @@ -28,8 +28,8 @@ define i32 @f0(i32 %a0, i32 %a1, i32 %a2) #0 { ; CHECK-NEXT: .LBB0_3: // %b3 ; CHECK-NEXT: { ; CHECK-NEXT: call f3 -; CHECK-NEXT: r1 = add(r16,#2) -; CHECK-NEXT: r0 = r17 +; CHECK-NEXT: r1 = add(r17,#2) +; CHECK-NEXT: r0 = r16 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r0 = #0 diff --git a/llvm/test/CodeGen/Hexagon/signext-inreg.ll b/llvm/test/CodeGen/Hexagon/signext-inreg.ll index cd9d783586957..fe74fa0f9a0ee 100644 --- a/llvm/test/CodeGen/Hexagon/signext-inreg.ll +++ b/llvm/test/CodeGen/Hexagon/signext-inreg.ll @@ -141,34 +141,34 @@ define <64 x i16> @test3(<64 x i16> %m) { ; CHECK-NEXT: r11:10 = memd(r29+#88) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = vaslh(r9:8,#8) +; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) ; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) -; CHECK-NEXT: r9:8 = memd(r29+#80) +; CHECK-NEXT: r13:12 = memd(r29+#80) ; CHECK-NEXT: r7:6 = memd(r29+#104) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r15:14 = vaslh(r7:6,#8) -; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) +; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) ; CHECK-NEXT: r7:6 = memd(r29+#72) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r15:14 = vasrh(r15:14,#8) -; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) +; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) -; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) +; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) ; CHECK-NEXT: r15:14 = memd(r29+#64) ; CHECK-NEXT: memd(r0+#120) = r15:14 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7:6 = vaslh(r7:6,#8) ; CHECK-NEXT: r15:14 = vaslh(r15:14,#8) -; CHECK-NEXT: r13:12 = memd(r29+#56) -; CHECK-NEXT: memd(r0+#112) = r13:12 +; CHECK-NEXT: r9:8 = memd(r29+#56) +; CHECK-NEXT: memd(r0+#112) = r9:8 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) ; CHECK-NEXT: r7:6 = vasrh(r7:6,#8) ; CHECK-NEXT: r11:10 = memd(r29+#48) ; CHECK-NEXT: memd(r0+#104) = r11:10 @@ -176,29 +176,29 @@ define <64 x i16> 
@test3(<64 x i16> %m) { ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) ; CHECK-NEXT: r15:14 = vasrh(r15:14,#8) -; CHECK-NEXT: r9:8 = memd(r29+#40) -; CHECK-NEXT: memd(r0+#96) = r9:8 +; CHECK-NEXT: r13:12 = memd(r29+#40) +; CHECK-NEXT: memd(r0+#96) = r13:12 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) -; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) +; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) ; CHECK-NEXT: r7:6 = memd(r29+#32) ; CHECK-NEXT: memd(r0+#88) = r7:6 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) -; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) +; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) ; CHECK-NEXT: r15:14 = memd(r29+#0) ; CHECK-NEXT: memd(r0+#80) = r15:14 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r7:6 = vaslh(r7:6,#8) ; CHECK-NEXT: r15:14 = vaslh(r15:14,#8) -; CHECK-NEXT: r13:12 = memd(r29+#16) -; CHECK-NEXT: memd(r0+#72) = r13:12 +; CHECK-NEXT: r9:8 = memd(r29+#16) +; CHECK-NEXT: memd(r0+#72) = r9:8 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) ; CHECK-NEXT: r7:6 = vasrh(r7:6,#8) ; CHECK-NEXT: r11:10 = memd(r29+#24) ; CHECK-NEXT: memd(r0+#64) = r11:10 @@ -206,29 +206,29 @@ define <64 x i16> @test3(<64 x i16> %m) { ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vaslh(r11:10,#8) ; CHECK-NEXT: r3:2 = vasrh(r3:2,#8) -; CHECK-NEXT: r9:8 = memd(r29+#8) -; CHECK-NEXT: memd(r0+#56) = r9:8 +; CHECK-NEXT: r13:12 = memd(r29+#8) +; CHECK-NEXT: memd(r0+#56) = r13:12 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r9:8 = vaslh(r9:8,#8) -; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) +; CHECK-NEXT: r13:12 = vaslh(r13:12,#8) +; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) ; CHECK-NEXT: memd(r0+#48) = r7:6 ; CHECK-NEXT: memd(r0+#0) = r3:2 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r11:10 = vasrh(r11:10,#8) ; CHECK-NEXT: r7:6 = vasrh(r15:14,#8) -; CHECK-NEXT: memd(r0+#32) = r13:12 +; CHECK-NEXT: memd(r0+#32) = r9:8 ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r9:8 = vasrh(r9:8,#8) +; CHECK-NEXT: r13:12 = vasrh(r13:12,#8) ; CHECK-NEXT: r5:4 = vasrh(r5:4,#8) ; CHECK-NEXT: memd(r0+#40) = r11:10 ; CHECK-NEXT: memd(r0+#16) = r7:6 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: memd(r0+#24) = r9:8 +; CHECK-NEXT: memd(r0+#24) = r13:12 ; CHECK-NEXT: memd(r0+#8) = r5:4 ; CHECK-NEXT: } ; diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll index 91b9ff36d29ab..1562f1872ceb7 100644 --- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll +++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; This version of the conv3x3 test has both loops. This test checks that the -; inner loop has 13 packets. +; inner loop has 14 packets. 
; CHECK: loop0(.LBB0_[[LOOP:.]], ; CHECK: .LBB0_[[LOOP]]: @@ -17,6 +17,7 @@ ; CHECK: } ; CHECK: } ; CHECK: } +; CHECK: } ; CHECK-NOT: } ; CHECK: }{{[ \t]*}}:endloop0 diff --git a/llvm/test/CodeGen/Hexagon/swp-stages4.ll b/llvm/test/CodeGen/Hexagon/swp-stages4.ll index ea88b79de9369..5377dc4d13abd 100644 --- a/llvm/test/CodeGen/Hexagon/swp-stages4.ll +++ b/llvm/test/CodeGen/Hexagon/swp-stages4.ll @@ -6,6 +6,7 @@ ; CHECK: = and ; CHECK: = and ; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1) +; CHECK: = and ; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255) ; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255) ; CHECK: loop0(.LBB0_[[LOOP:.]], diff --git a/llvm/test/CodeGen/MLRegAlloc/Inputs/reference-log-noml.txt b/llvm/test/CodeGen/MLRegAlloc/Inputs/reference-log-noml.txt index 6289619de7c55..ca0e66cff82c4 100644 --- a/llvm/test/CodeGen/MLRegAlloc/Inputs/reference-log-noml.txt +++ b/llvm/test/CodeGen/MLRegAlloc/Inputs/reference-log-noml.txt @@ -16,8 +16,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7265065908432007,0.0, start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.3333333432674408,0.3333333432674408,0.3333333432674408,0.3333333432674408,0.1666666716337204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3333333432674408 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.2724486181679993e-10,0.9760092496871948,0.9760092496871948,0.9760092496871948,2.2724486181679993e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9760092496871948 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23831403255462646,0.07943800836801529,0.07943800836801529,0.07943800836801529,0.9912577867507935,0.07069581001996994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.962109386920929,0.7933593988418579,0.790234386920929,0.740234375,0.908984363079071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.737500011920929 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397,0.01772311143577099,0.014231426641345024,0.014288840815424919,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4279724359512329 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9591121673583984,0.7940031290054321,0.7908878326416016,0.7379283308982849,0.9061526656150818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7352024912834167 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129,0.01772311143577099,0.01417447254061699,0.014231426641345024,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4279724359512329 max_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -40,8 +40,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0 start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.3333333432674408,0.0,0.3333333432674408,0.3333333432674408,0.1666666716337204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3333333432674408 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.2724486181679993e-10,0.0,0.9760092496871948,0.9760092496871948,2.2724486181679993e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9760092496871948 hottest_bb_freq_by_max: 
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2404157966375351,0.08013860136270523,0.0,0.08013860136270523,1.0,0.07131929695606232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08013860136270523 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.962109386920929,0.0,0.790234386920929,0.740234375,0.908984363079071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7933593988418579 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397,0.01772311143577099,0.0,0.014288840815424919,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014231426641345024 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9591121673583984,0.0,0.7908878326416016,0.7379283308982849,0.9061526656150818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7940031290054321 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129,0.01772311143577099,0.0,0.014231426641345024,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01417447254061699 max_stage: 0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -64,8 +64,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2421688437461853,0.0, start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1666666716337204,0.3333333432674408,0.3333333432674408,0.3333333432674408,0.3333333432674408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1666666716337204 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.2724486181679993e-10,2.2724486181679993e-10,0.9760092496871948,0.9760092496871948,0.9760092496871948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9760092496871948 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06705831736326218,0.01989283785223961,0.02235277369618416,0.2813863754272461,0.02235277369618416,0.27892643213272095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.908984363079071,0.962109386920929,0.737500011920929,0.790234386920929,0.740234375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674609363079071 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397,0.0737093985080719,0.01772311143577099,0.4279724359512329,0.014288840815424919,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4858442544937134 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9061526656150818,0.9591121673583984,0.7352024912834167,0.7908878326416016,0.7379283308982849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6725077629089355 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129,0.0737093985080719,0.01772311143577099,0.4279724359512329,0.014231426641345024,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4858442544937134 max_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -88,8 +88,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2421688437461853,0.0, start_bb_freq_by_max: 
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.3333333432674408,0.3333333432674408,0.0,0.3333333432674408,0.1666666716337204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3333333432674408 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.2724486181679993e-10,0.9760092496871948,0.0,0.9760092496871948,2.2724486181679993e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9760092496871948 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23831403255462646,0.07943800836801529,1.0,0.0,0.9912577867507935,0.07069581001996994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07943800836801529 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.962109386920929,0.737500011920929,0.0,0.740234375,0.908984363079071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.790234386920929 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397,0.01772311143577099,0.4279724359512329,0.0,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014288840815424919 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9591121673583984,0.7352024912834167,0.0,0.7379283308982849,0.9061526656150818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7908878326416016 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129,0.01772311143577099,0.4279724359512329,0.0,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014231426641345024 max_stage: 0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -112,8 +112,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2421688437461853,0.0, start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.3333333432674408,0.3333333432674408,0.1666666716337204,0.3333333432674408,0.1666666716337204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1666666716337204 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.2724486181679993e-10,0.9760092496871948,0.9760092496871948,0.9760092496871948,2.2724486181679993e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9760092496871948 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06705831736326218,0.02235277369618416,0.2813863754272461,1.0,0.27892643213272095,0.01989283785223961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01117638684809208 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.962109386920929,0.737500011920929,0.674609363079071,0.740234375,0.908984363079071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6714843511581421 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397,0.01772311143577099,0.4279724359512329,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00449750293046236 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9591121673583984,0.7352024912834167,0.6725077629089355,0.7379283308982849,0.9061526656150818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6693925261497498 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129,0.01772311143577099,0.4279724359512329,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00449750293046236 
max_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -136,8 +136,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2421688437461853,0.0, start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.3333333432674408,0.3333333432674408,0.1666666716337204,0.3333333432674408,0.1666666716337204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1666666716337204 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.2724486181679993e-10,0.9760092496871948,0.9760092496871948,0.9760092496871948,2.2724486181679993e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9760092496871948 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06705831736326218,0.02235277369618416,0.2813863754272461,1.0,0.27892643213272095,0.01989283785223961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01822916604578495 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.962109386920929,0.737500011920929,0.674609363079071,0.740234375,0.908984363079071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6683593988418579 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397,0.01772311143577099,0.4279724359512329,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008109557442367077 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9591121673583984,0.7352024912834167,0.6725077629089355,0.7379283308982849,0.9061526656150818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6662772297859192 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129,0.01772311143577099,0.4279724359512329,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008109557442367077 max_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -160,8 +160,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2421688586473465,0.0, start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1666666716337204,0.3333333432674408,0.3333333432674408,0.1666666716337204,0.3333333432674408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1666666716337204 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.2724486181679993e-10,2.2724486181679993e-10,0.9760092496871948,0.9760092496871948,0.9760092496871948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9760092496871948 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06705831736326218,0.01989283785223961,0.02235277369618416,0.2813863754272461,1.0,0.27892643213272095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2631579041481018 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.908984363079071,0.962109386920929,0.737500011920929,0.674609363079071,0.740234375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.665234386920929 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397,0.0737093985080719,0.01772311143577099,0.4279724359512329,0.4858442544937134,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07601386308670044 +liverange_size: 
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9061526656150818,0.9591121673583984,0.7352024912834167,0.6725077629089355,0.7379283308982849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6631619930267334 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129,0.0737093985080719,0.01772311143577099,0.4279724359512329,0.4858442544937134,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07601386308670044 max_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -184,8 +184,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2421688437461853,0.0, start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.3333333432674408,0.1666666716337204,0.3333333432674408,0.1666666716337204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3333333432674408 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9760092496871948,0.9760092496871948,0.9760092496871948,2.2724486181679993e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.2724486181679993e-10 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06705831736326218,0.0,0.2813863754272461,1.0,0.27892643213272095,0.01989283785223961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02235277369618416 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.737500011920929,0.674609363079071,0.740234375,0.908984363079071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.962109386920929 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397,0.0,0.4279724359512329,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01772311143577099 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.7352024912834167,0.6725077629089355,0.7379283308982849,0.9061526656150818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9591121673583984 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129,0.0,0.4279724359512329,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01772311143577099 max_stage: 0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -208,8 +208,8 @@ hint_weights_by_max: 1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2421688437461853, start_bb_freq_by_max: 0.3333333432674408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1666666716337204,0.1666666716337204,0.3333333432674408,0.1666666716337204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1666666716337204 end_bb_freq_by_max: 0.9760092496871948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9760092496871948,0.9760092496871948,0.9760092496871948,2.2724486181679993e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5110765099525452 hottest_bb_freq_by_max: 0.2813863754272461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06705831736326218,0.2631579041481018,1.0,0.27892643213272095,0.01989283785223961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27892643213272095 -liverange_size: 
0.737500011920929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.665234386920929,0.674609363079071,0.740234375,0.908984363079071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4625000059604645 -use_def_density: 0.42606985569000244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050985489040613174,0.07567594200372696,0.48368439078330994,0.9955543875694275,0.07338171452283859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 +liverange_size: 0.7352024912834167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.6631619930267334,0.6725077629089355,0.7379283308982849,0.9061526656150818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46105918288230896 +use_def_density: 0.42606985569000244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05081497132778168,0.07567594200372696,0.48368439078330994,0.9955543875694275,0.07338171452283859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 max_stage: 1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -232,8 +232,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7265065908432007, start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1666666716337204,0.1666666716337204,0.3333333432674408,0.1666666716337204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3333333432674408 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9760092496871948,0.9760092496871948,0.9760092496871948,2.2724486181679993e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9760092496871948 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06705831736326218,0.2631579041481018,1.0,0.27892643213272095,0.01989283785223961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2813863754272461 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.665234386920929,0.674609363079071,0.740234375,0.908984363079071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.737500011920929 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397,0.07601386308670044,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4279724359512329 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.6631619930267334,0.6725077629089355,0.7379283308982849,0.9061526656150818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7352024912834167 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129,0.07601386308670044,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4279724359512329 max_stage: 0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -256,8 +256,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1666666716337204,0.0,0.1666666716337204,0.3333333432674408,0.1666666716337204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9760092496871948,0.0,0.9760092496871948,0.9760092496871948,2.2724486181679993e-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 hottest_bb_freq_by_max: 
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2631579041481018,0.0,1.0,0.27892643213272095,0.01989283785223961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06705831736326218 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.665234386920929,0.0,0.674609363079071,0.740234375,0.908984363079071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07601386308670044,0.0,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05121316388249397 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6631619930267334,0.0,0.6725077629089355,0.7379283308982849,0.9061526656150818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07601386308670044,0.0,0.4858442544937134,1.0,0.0737093985080719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05104188248515129 max_stage: 0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 min_stage: 0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 progress: 0.7777777910232544 @@ -400,8 +400,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.07419288158416748 start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35764437913894653,0.35764437913894653,0.35764437913894653,0.35764437913894653,0.7152887582778931,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7152887582778931 end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6365708112716675,0.6365708112716675,0.3333333432674408,0.6365708112716675,0.6365708112716675,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6365708112716675 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2813863754272461,0.2631579041481018,0.27892643213272095,1.0,0.27892643213272095,0.2631579041481018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02235277369618416 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9478557705879211,0.5769980549812317,0.8416179418563843,0.9234892725944519,0.3196881115436554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9839181303977966 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3284657895565033,0.07978282868862152,0.7709200978279114,0.37288200855255127,0.7674928903579712,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006986536085605621 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9451456069946289,0.5747572779655457,0.8383495211601257,0.9199029207229614,0.3184466063976288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9839805960655212 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3284657895565033,0.07900823652744293,0.7709200978279114,0.37288200855255127,0.7674928903579712,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006958406884223223 max_stage: 0,0,0,0,0,0,0,0,0,4,4,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4 min_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4 progress: 0.1111111119389534 @@ -424,8 +424,8 @@ hint_weights_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.07419288158416748 start_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35764437913894653,0.35764437913894653,0.35764437913894653,0.35764437913894653,0.7152887582778931,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7152887582778931 
end_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6365708112716675,0.6365708112716675,0.3333333432674408,0.6365708112716675,0.6365708112716675,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6365708112716675 hottest_bb_freq_by_max: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2813863754272461,0.2631579041481018,0.27892643213272095,1.0,0.27892643213272095,0.2631579041481018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02235277369618416 -liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9478557705879211,0.5769980549812317,0.8416179418563843,0.9234892725944519,0.3196881115436554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9848927855491638 -use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3284657895565033,0.07978282868862152,0.7709200978279114,0.37288200855255127,0.7674928903579712,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006979482248425484 +liverange_size: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.9451456069946289,0.5747572779655457,0.8383495211601257,0.9199029207229614,0.3184466063976288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9849514365196228 +use_def_density: 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3284657895565033,0.07900823652744293,0.7709200978279114,0.37288200855255127,0.7674928903579712,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006951410323381424 max_stage: 0,0,0,0,0,0,0,0,0,4,4,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4 min_stage: 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4 progress: 0.0833333358168602 diff --git a/llvm/test/CodeGen/MLRegAlloc/Inputs/reference-prio-log-noml.txt b/llvm/test/CodeGen/MLRegAlloc/Inputs/reference-prio-log-noml.txt index a39b7ace101f3..beb0c5205979c 100644 --- a/llvm/test/CodeGen/MLRegAlloc/Inputs/reference-prio-log-noml.txt +++ b/llvm/test/CodeGen/MLRegAlloc/Inputs/reference-prio-log-noml.txt @@ -237,7 +237,7 @@ observation: 39 li_size: 0 stage: 0 weight: 0.0 -priority: 3582.0 +priority: 3598.0 reward: 0.0 observation: 40 li_size: 0 @@ -249,7 +249,7 @@ observation: 41 li_size: 0 stage: 0 weight: 0.0 -priority: 3566.0 +priority: 3582.0 reward: 0.0 observation: 42 li_size: 0 @@ -291,7 +291,7 @@ observation: 48 li_size: 0 stage: 0 weight: 0.0 -priority: 4368.0 +priority: 4384.0 reward: 0.0 observation: 49 li_size: 0 diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-prio-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-prio-logging.ll index 9cd156d8446a5..21bb75278874a 100644 --- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-prio-logging.ll +++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-prio-logging.ll @@ -24,5 +24,5 @@ ; CHECK-NOT: nan ; CHECK-LABEL: priority: ; NOML-SAME: 2684358144.0 -; ML-SAME: 3583 +; ML-SAME: 3599 ; CHECK-LABEL: reward: diff --git a/llvm/test/CodeGen/PowerPC/all-atomics.ll b/llvm/test/CodeGen/PowerPC/all-atomics.ll index c71a6908fae5c..093253bf8f691 100644 --- a/llvm/test/CodeGen/PowerPC/all-atomics.ll +++ b/llvm/test/CodeGen/PowerPC/all-atomics.ll @@ -4688,13 +4688,13 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwsync ; AIX32-NEXT: lbz 4, 0(29) ; AIX32-NEXT: rlwinm 20, 29, 0, 0, 29 -; AIX32-NEXT: xori 24, 5, 24 -; AIX32-NEXT: slw 5, 3, 24 +; AIX32-NEXT: xori 25, 5, 24 +; AIX32-NEXT: slw 5, 3, 25 ; AIX32-NEXT: stb 3, 0(28) ; AIX32-NEXT: li 3, 255 ; AIX32-NEXT: sync -; AIX32-NEXT: slw 6, 4, 24 -; AIX32-NEXT: slw 3, 3, 24 +; AIX32-NEXT: slw 6, 4, 25 +; AIX32-NEXT: slw 3, 3, 25 ; AIX32-NEXT: and 4, 
5, 3 ; AIX32-NEXT: and 5, 6, 3 ; AIX32-NEXT: L..BB3_4: # %entry @@ -4711,7 +4711,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: bne 0, L..BB3_4 ; AIX32-NEXT: L..BB3_6: # %entry ; AIX32-NEXT: lwsync -; AIX32-NEXT: srw 4, 6, 24 +; AIX32-NEXT: srw 4, 6, 25 ; AIX32-NEXT: lbz 3, 0(28) ; AIX32-NEXT: extsb 5, 3 ; AIX32-NEXT: lwz 3, L..C2(2) # @ss @@ -4750,12 +4750,12 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwz 3, L..C3(2) # @us ; AIX32-NEXT: rlwinm 6, 3, 3, 27, 27 ; AIX32-NEXT: rlwinm 19, 3, 0, 0, 29 -; AIX32-NEXT: xori 23, 6, 16 -; AIX32-NEXT: slw 6, 4, 23 +; AIX32-NEXT: xori 24, 6, 16 +; AIX32-NEXT: slw 6, 4, 24 ; AIX32-NEXT: li 4, 0 -; AIX32-NEXT: slw 5, 5, 23 +; AIX32-NEXT: slw 5, 5, 24 ; AIX32-NEXT: ori 4, 4, 65535 -; AIX32-NEXT: slw 4, 4, 23 +; AIX32-NEXT: slw 4, 4, 24 ; AIX32-NEXT: and 5, 5, 4 ; AIX32-NEXT: and 6, 6, 4 ; AIX32-NEXT: L..BB3_10: # %entry @@ -4771,7 +4771,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: stwcx. 8, 0, 19 ; AIX32-NEXT: bne 0, L..BB3_10 ; AIX32-NEXT: L..BB3_12: # %entry -; AIX32-NEXT: srw 4, 7, 23 +; AIX32-NEXT: srw 4, 7, 24 ; AIX32-NEXT: lwsync ; AIX32-NEXT: lwz 17, L..C4(2) # @si ; AIX32-NEXT: sth 4, 0(3) @@ -4810,11 +4810,11 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwz 31, L..C6(2) # @sll ; AIX32-NEXT: stw 5, 0(27) ; AIX32-NEXT: lbz 3, 0(28) -; AIX32-NEXT: li 25, 0 +; AIX32-NEXT: li 23, 0 ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: li 7, 5 ; AIX32-NEXT: li 8, 5 -; AIX32-NEXT: stw 25, 56(1) +; AIX32-NEXT: stw 23, 56(1) ; AIX32-NEXT: extsb 6, 3 ; AIX32-NEXT: lbz 3, 0(29) ; AIX32-NEXT: srawi 5, 6, 31 @@ -4832,7 +4832,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: extsb 6, 4 ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: srawi 5, 6, 31 -; AIX32-NEXT: stw 25, 56(1) +; AIX32-NEXT: stw 23, 56(1) ; AIX32-NEXT: stw 3, 0(31) ; AIX32-NEXT: lbz 3, 0(29) ; AIX32-NEXT: stw 3, 60(1) @@ -4870,14 +4870,14 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lbz 5, 0(28) ; AIX32-NEXT: cmpw 4, 3 ; AIX32-NEXT: li 3, 1 -; AIX32-NEXT: iseleq 4, 3, 25 -; AIX32-NEXT: slw 6, 5, 24 +; AIX32-NEXT: iseleq 4, 3, 23 +; AIX32-NEXT: slw 6, 5, 25 ; AIX32-NEXT: li 5, 255 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) -; AIX32-NEXT: slw 5, 5, 24 +; AIX32-NEXT: slw 5, 5, 25 ; AIX32-NEXT: sync -; AIX32-NEXT: slw 7, 4, 24 +; AIX32-NEXT: slw 7, 4, 25 ; AIX32-NEXT: and 6, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: L..BB3_22: # %entry @@ -4893,11 +4893,11 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: stwcx. 
9, 0, 20 ; AIX32-NEXT: bne 0, L..BB3_22 ; AIX32-NEXT: L..BB3_24: # %entry -; AIX32-NEXT: srw 5, 8, 24 +; AIX32-NEXT: srw 5, 8, 25 ; AIX32-NEXT: lwsync ; AIX32-NEXT: cmpw 5, 4 ; AIX32-NEXT: lbz 5, 0(28) -; AIX32-NEXT: iseleq 4, 3, 25 +; AIX32-NEXT: iseleq 4, 3, 23 ; AIX32-NEXT: extsb 5, 5 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) @@ -4926,16 +4926,16 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: lwsync ; AIX32-NEXT: cmpw 5, 4 ; AIX32-NEXT: lbz 5, 0(28) -; AIX32-NEXT: iseleq 4, 3, 25 +; AIX32-NEXT: iseleq 4, 3, 23 ; AIX32-NEXT: extsb 5, 5 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) ; AIX32-NEXT: sync -; AIX32-NEXT: slw 6, 5, 23 +; AIX32-NEXT: slw 6, 5, 24 ; AIX32-NEXT: li 5, 0 -; AIX32-NEXT: slw 7, 4, 23 +; AIX32-NEXT: slw 7, 4, 24 ; AIX32-NEXT: ori 5, 5, 65535 -; AIX32-NEXT: slw 5, 5, 23 +; AIX32-NEXT: slw 5, 5, 24 ; AIX32-NEXT: and 6, 6, 5 ; AIX32-NEXT: and 7, 7, 5 ; AIX32-NEXT: L..BB3_28: # %entry @@ -4951,11 +4951,11 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: stwcx. 9, 0, 19 ; AIX32-NEXT: bne 0, L..BB3_28 ; AIX32-NEXT: L..BB3_30: # %entry -; AIX32-NEXT: srw 5, 8, 23 +; AIX32-NEXT: srw 5, 8, 24 ; AIX32-NEXT: lwsync ; AIX32-NEXT: cmpw 5, 4 ; AIX32-NEXT: lbz 5, 0(28) -; AIX32-NEXT: iseleq 4, 3, 25 +; AIX32-NEXT: iseleq 4, 3, 23 ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) ; AIX32-NEXT: sync @@ -4971,7 +4971,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: bne 0, L..BB3_31 ; AIX32-NEXT: L..BB3_33: # %entry ; AIX32-NEXT: lwsync -; AIX32-NEXT: isel 4, 3, 25, 6 +; AIX32-NEXT: isel 4, 3, 23, 6 ; AIX32-NEXT: lbz 5, 0(28) ; AIX32-NEXT: stw 4, 0(27) ; AIX32-NEXT: lbz 4, 0(29) @@ -4988,13 +4988,13 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: bne 0, L..BB3_34 ; AIX32-NEXT: L..BB3_36: # %entry ; AIX32-NEXT: lwsync -; AIX32-NEXT: isel 3, 3, 25, 6 +; AIX32-NEXT: isel 3, 3, 23, 6 ; AIX32-NEXT: li 7, 5 ; AIX32-NEXT: li 8, 5 ; AIX32-NEXT: lbz 4, 0(28) ; AIX32-NEXT: stw 3, 0(27) ; AIX32-NEXT: lbz 3, 0(29) -; AIX32-NEXT: stw 25, 56(1) +; AIX32-NEXT: stw 23, 56(1) ; AIX32-NEXT: extsb 6, 4 ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: stw 3, 60(1) @@ -5011,7 +5011,7 @@ define dso_local void @test_compare_and_swap() local_unnamed_addr #0 { ; AIX32-NEXT: addi 4, 1, 56 ; AIX32-NEXT: stw 3, 60(1) ; AIX32-NEXT: mr 3, 30 -; AIX32-NEXT: stw 25, 56(1) +; AIX32-NEXT: stw 23, 56(1) ; AIX32-NEXT: srawi 5, 6, 31 ; AIX32-NEXT: bl .__atomic_compare_exchange_8[PR] ; AIX32-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll index 57992cff28c62..23ff5f6926916 100644 --- a/llvm/test/CodeGen/PowerPC/atomics.ll +++ b/llvm/test/CodeGen/PowerPC/atomics.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc-unknown-linux-gnu -verify-machineinstrs -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32 ; This is already checked for in Atomics-64.ll ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64 @@ -368,48 +368,48 @@ define i8 @add_i8_monotonic(ptr %mem, i8 %operand) { define i16 @xor_i16_seq_cst(ptr %mem, i16 %operand) { ; PPC32-LABEL: xor_i16_seq_cst: ; PPC32: # 
%bb.0: -; PPC32-NEXT: li r6, 0 -; PPC32-NEXT: rlwinm r7, r3, 3, 27, 27 -; PPC32-NEXT: rlwinm r5, r3, 0, 0, 29 -; PPC32-NEXT: ori r6, r6, 65535 -; PPC32-NEXT: xori r3, r7, 16 -; PPC32-NEXT: slw r4, r4, r3 -; PPC32-NEXT: slw r6, r6, r3 +; PPC32-NEXT: li r5, 0 +; PPC32-NEXT: rlwinm r6, r3, 3, 27, 27 +; PPC32-NEXT: ori r7, r5, 65535 +; PPC32-NEXT: xori r5, r6, 16 +; PPC32-NEXT: rlwinm r3, r3, 0, 0, 29 +; PPC32-NEXT: slw r4, r4, r5 +; PPC32-NEXT: slw r6, r7, r5 ; PPC32-NEXT: sync ; PPC32-NEXT: .LBB13_1: -; PPC32-NEXT: lwarx r7, 0, r5 +; PPC32-NEXT: lwarx r7, 0, r3 ; PPC32-NEXT: xor r8, r4, r7 ; PPC32-NEXT: andc r9, r7, r6 ; PPC32-NEXT: and r8, r8, r6 ; PPC32-NEXT: or r8, r8, r9 -; PPC32-NEXT: stwcx. r8, 0, r5 +; PPC32-NEXT: stwcx. r8, 0, r3 ; PPC32-NEXT: bne cr0, .LBB13_1 ; PPC32-NEXT: # %bb.2: -; PPC32-NEXT: srw r3, r7, r3 +; PPC32-NEXT: srw r3, r7, r5 ; PPC32-NEXT: clrlwi r3, r3, 16 ; PPC32-NEXT: lwsync ; PPC32-NEXT: blr ; ; PPC64-LABEL: xor_i16_seq_cst: ; PPC64: # %bb.0: -; PPC64-NEXT: li r6, 0 -; PPC64-NEXT: rlwinm r7, r3, 3, 27, 27 -; PPC64-NEXT: rldicr r5, r3, 0, 61 -; PPC64-NEXT: ori r6, r6, 65535 -; PPC64-NEXT: xori r3, r7, 16 -; PPC64-NEXT: slw r4, r4, r3 -; PPC64-NEXT: slw r6, r6, r3 +; PPC64-NEXT: li r5, 0 +; PPC64-NEXT: rlwinm r6, r3, 3, 27, 27 +; PPC64-NEXT: ori r7, r5, 65535 +; PPC64-NEXT: xori r5, r6, 16 +; PPC64-NEXT: rldicr r3, r3, 0, 61 +; PPC64-NEXT: slw r4, r4, r5 +; PPC64-NEXT: slw r6, r7, r5 ; PPC64-NEXT: sync ; PPC64-NEXT: .LBB13_1: -; PPC64-NEXT: lwarx r7, 0, r5 +; PPC64-NEXT: lwarx r7, 0, r3 ; PPC64-NEXT: xor r8, r4, r7 ; PPC64-NEXT: andc r9, r7, r6 ; PPC64-NEXT: and r8, r8, r6 ; PPC64-NEXT: or r8, r8, r9 -; PPC64-NEXT: stwcx. r8, 0, r5 +; PPC64-NEXT: stwcx. r8, 0, r3 ; PPC64-NEXT: bne cr0, .LBB13_1 ; PPC64-NEXT: # %bb.2: -; PPC64-NEXT: srw r3, r7, r3 +; PPC64-NEXT: srw r3, r7, r5 ; PPC64-NEXT: clrlwi r3, r3, 16 ; PPC64-NEXT: lwsync ; PPC64-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/inc-of-add.ll b/llvm/test/CodeGen/PowerPC/inc-of-add.ll index 0b06d7ed586bf..c6d6f6a17b1b5 100644 --- a/llvm/test/CodeGen/PowerPC/inc-of-add.ll +++ b/llvm/test/CodeGen/PowerPC/inc-of-add.ll @@ -66,88 +66,89 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32: # %bb.0: ; PPC32-NEXT: stwu 1, -64(1) ; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill -; PPC32-NEXT: lbz 21, 123(1) ; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: add 7, 21, 7 -; PPC32-NEXT: lbz 23, 115(1) +; PPC32-NEXT: lbz 4, 115(1) ; PPC32-NEXT: lbz 22, 119(1) -; PPC32-NEXT: lbz 21, 135(1) -; PPC32-NEXT: add 5, 23, 5 -; PPC32-NEXT: lbz 23, 127(1) -; PPC32-NEXT: add 6, 22, 6 +; PPC32-NEXT: lbz 21, 123(1) +; PPC32-NEXT: add 4, 4, 5 +; PPC32-NEXT: add 5, 22, 6 ; PPC32-NEXT: lbz 22, 131(1) -; PPC32-NEXT: add 10, 21, 10 -; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill -; PPC32-NEXT: add 8, 23, 8 -; PPC32-NEXT: lbz 26, 83(1) +; PPC32-NEXT: add 6, 21, 7 +; PPC32-NEXT: lbz 21, 135(1) +; PPC32-NEXT: addi 6, 6, 1 +; PPC32-NEXT: stw 20, 16(1) # 4-byte Folded Spill ; PPC32-NEXT: add 9, 22, 9 +; PPC32-NEXT: lbz 20, 127(1) +; PPC32-NEXT: add 10, 21, 10 +; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: addi 5, 5, 1 +; PPC32-NEXT: lbz 25, 83(1) +; PPC32-NEXT: add 7, 20, 8 ; PPC32-NEXT: lbz 21, 147(1) +; PPC32-NEXT: addi 7, 7, 1 ; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: add 26, 21, 26 -; PPC32-NEXT: lbz 25, 79(1) -; PPC32-NEXT: lbz 24, 75(1) -; 
PPC32-NEXT: lbz 23, 139(1) +; PPC32-NEXT: addi 4, 4, 1 +; PPC32-NEXT: lbz 24, 79(1) +; PPC32-NEXT: add 25, 21, 25 ; PPC32-NEXT: lbz 22, 143(1) -; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: add 24, 23, 24 -; PPC32-NEXT: lbz 29, 95(1) -; PPC32-NEXT: add 25, 22, 25 +; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 23, 75(1) +; PPC32-NEXT: add 24, 22, 24 +; PPC32-NEXT: lbz 8, 139(1) +; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 28, 95(1) +; PPC32-NEXT: add 8, 8, 23 ; PPC32-NEXT: lbz 21, 159(1) +; PPC32-NEXT: addi 8, 8, 1 ; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; PPC32-NEXT: add 29, 21, 29 -; PPC32-NEXT: lbz 28, 91(1) -; PPC32-NEXT: lbz 27, 87(1) -; PPC32-NEXT: lbz 23, 151(1) +; PPC32-NEXT: lbz 27, 91(1) +; PPC32-NEXT: add 28, 21, 28 ; PPC32-NEXT: lbz 22, 155(1) -; PPC32-NEXT: lbz 4, 111(1) -; PPC32-NEXT: add 27, 23, 27 +; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 26, 87(1) +; PPC32-NEXT: add 27, 22, 27 +; PPC32-NEXT: lbz 23, 151(1) +; PPC32-NEXT: lbz 11, 111(1) ; PPC32-NEXT: lbz 21, 175(1) -; PPC32-NEXT: add 28, 22, 28 -; PPC32-NEXT: lbz 11, 107(1) -; PPC32-NEXT: lbz 12, 171(1) -; PPC32-NEXT: add 4, 21, 4 +; PPC32-NEXT: add 26, 23, 26 +; PPC32-NEXT: lbz 12, 107(1) +; PPC32-NEXT: lbz 0, 171(1) +; PPC32-NEXT: add 11, 21, 11 ; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: addi 4, 4, 1 -; PPC32-NEXT: lbz 0, 103(1) -; PPC32-NEXT: add 11, 12, 11 -; PPC32-NEXT: lbz 30, 99(1) -; PPC32-NEXT: lbz 23, 163(1) +; PPC32-NEXT: addi 11, 11, 1 +; PPC32-NEXT: lbz 30, 103(1) +; PPC32-NEXT: add 12, 0, 12 ; PPC32-NEXT: lbz 22, 167(1) -; PPC32-NEXT: add 30, 23, 30 -; PPC32-NEXT: stb 4, 15(3) -; PPC32-NEXT: add 23, 22, 0 -; PPC32-NEXT: addi 4, 11, 1 -; PPC32-NEXT: stb 4, 14(3) -; PPC32-NEXT: addi 4, 23, 1 -; PPC32-NEXT: stb 4, 13(3) -; PPC32-NEXT: addi 4, 30, 1 -; PPC32-NEXT: stb 4, 12(3) -; PPC32-NEXT: addi 4, 29, 1 -; PPC32-NEXT: stb 4, 11(3) -; PPC32-NEXT: addi 4, 28, 1 -; PPC32-NEXT: stb 4, 10(3) -; PPC32-NEXT: addi 4, 27, 1 -; PPC32-NEXT: stb 4, 9(3) -; PPC32-NEXT: addi 4, 26, 1 -; PPC32-NEXT: stb 4, 8(3) -; PPC32-NEXT: addi 4, 25, 1 -; PPC32-NEXT: stb 4, 7(3) -; PPC32-NEXT: addi 4, 24, 1 -; PPC32-NEXT: stb 4, 6(3) -; PPC32-NEXT: addi 4, 10, 1 -; PPC32-NEXT: stb 4, 5(3) -; PPC32-NEXT: addi 4, 9, 1 -; PPC32-NEXT: stb 4, 4(3) -; PPC32-NEXT: addi 4, 8, 1 -; PPC32-NEXT: stb 4, 3(3) -; PPC32-NEXT: addi 4, 7, 1 -; PPC32-NEXT: stb 4, 2(3) -; PPC32-NEXT: addi 4, 6, 1 -; PPC32-NEXT: stb 4, 1(3) -; PPC32-NEXT: addi 4, 5, 1 +; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: lbz 29, 99(1) +; PPC32-NEXT: add 30, 22, 30 +; PPC32-NEXT: lbz 23, 163(1) +; PPC32-NEXT: stb 11, 15(3) +; PPC32-NEXT: addi 11, 12, 1 +; PPC32-NEXT: add 29, 23, 29 +; PPC32-NEXT: stb 11, 14(3) +; PPC32-NEXT: addi 11, 30, 1 +; PPC32-NEXT: stb 11, 13(3) +; PPC32-NEXT: addi 11, 29, 1 +; PPC32-NEXT: stb 11, 12(3) +; PPC32-NEXT: addi 11, 28, 1 +; PPC32-NEXT: stb 11, 11(3) +; PPC32-NEXT: addi 11, 27, 1 +; PPC32-NEXT: stb 11, 10(3) +; PPC32-NEXT: addi 11, 26, 1 +; PPC32-NEXT: stb 11, 9(3) +; PPC32-NEXT: addi 11, 25, 1 +; PPC32-NEXT: stb 8, 6(3) +; PPC32-NEXT: addi 8, 10, 1 +; PPC32-NEXT: stb 11, 8(3) +; PPC32-NEXT: addi 11, 24, 1 +; PPC32-NEXT: stb 8, 5(3) +; PPC32-NEXT: addi 8, 9, 1 +; PPC32-NEXT: stb 11, 7(3) +; PPC32-NEXT: stb 8, 4(3) +; PPC32-NEXT: stb 7, 3(3) +; PPC32-NEXT: stb 6, 2(3) +; PPC32-NEXT: stb 5, 1(3) ; PPC32-NEXT: stb 4, 0(3) ; PPC32-NEXT: lwz 30, 56(1) # 4-byte 
Folded Reload ; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload @@ -159,6 +160,7 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload ; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 20, 16(1) # 4-byte Folded Reload ; PPC32-NEXT: addi 1, 1, 64 ; PPC32-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir b/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir index 1cc50ab4dcdd5..b9c541feae5ac 100644 --- a/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir +++ b/llvm/test/CodeGen/PowerPC/ldst-16-byte.mir @@ -11,10 +11,11 @@ body: | liveins: $x3, $x4 ; CHECK-LABEL: name: foo ; CHECK: liveins: $x3, $x4 - ; CHECK: early-clobber renamable $g8p3 = LQ 128, $x4 - ; CHECK: $x3 = OR8 $x7, $x7 - ; CHECK: STQ killed renamable $g8p3, 160, $x3 - ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber renamable $g8p3 = LQ 128, $x4 + ; CHECK-NEXT: $x3 = OR8 $x7, $x7 + ; CHECK-NEXT: STQ killed renamable $g8p3, 160, $x3 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 %0:g8prc = LQ 128, $x4 $x3 = COPY %0.sub_gp8_x1:g8prc STQ %0, 160, $x3 @@ -30,10 +31,11 @@ body: | liveins: $x3, $x4 ; CHECK-LABEL: name: foobar ; CHECK: liveins: $x3, $x4 - ; CHECK: renamable $g8p3 = LQARX $x3, $x4 - ; CHECK: STQCX renamable $g8p3, $x3, $x4, implicit-def dead $cr0 - ; CHECK: $x3 = OR8 $x7, killed $x7 - ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $g8p3 = LQARX $x3, $x4 + ; CHECK-NEXT: STQCX renamable $g8p3, $x3, $x4, implicit-def dead $cr0 + ; CHECK-NEXT: $x3 = OR8 $x7, killed $x7 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 %0:g8prc = LQARX $x3, $x4 STQCX %0:g8prc, $x3, $x4, implicit-def $cr0 $x3 = COPY %0.sub_gp8_x1:g8prc @@ -49,10 +51,11 @@ body: | liveins: $x3, $x4 ; CHECK-LABEL: name: bar ; CHECK: liveins: $x3, $x4 - ; CHECK: early-clobber renamable $g8p2 = LQ 128, renamable $x3 - ; CHECK: STQ renamable $g8p2, 160, $x3 - ; CHECK: $x3 = OR8 $x4, killed $x4 - ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber renamable $g8p2 = LQ 128, renamable $x3 + ; CHECK-NEXT: STQ renamable $g8p2, 160, $x3 + ; CHECK-NEXT: $x3 = OR8 $x4, killed $x4 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 %0:g8rc_nox0 = COPY $x3 %1:g8prc = LQ 128, %0 STQ %1, 160, $x3 @@ -71,97 +74,98 @@ body: | liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12 ; CHECK-LABEL: name: spill_g8prc ; CHECK: liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $x29, $x30, $x31 - ; CHECK: STD killed $x14, -144, $x1 :: (store (s64) into %fixed-stack.17, align 16) - ; CHECK: STD killed $x15, -136, $x1 :: (store (s64) into %fixed-stack.16) - ; CHECK: STD killed $x16, -128, $x1 :: (store (s64) into %fixed-stack.15, align 16) - ; CHECK: STD killed $x17, -120, $x1 :: (store (s64) into %fixed-stack.14) - ; CHECK: STD killed $x18, -112, $x1 :: (store (s64) into %fixed-stack.13, align 16) - ; CHECK: STD killed $x19, -104, $x1 :: (store (s64) into %fixed-stack.12) - ; CHECK: STD killed $x20, -96, $x1 :: (store (s64) into %fixed-stack.11, align 16) - ; CHECK: STD killed $x21, -88, $x1 :: (store (s64) into %fixed-stack.10) - ; CHECK: STD killed $x22, -80, $x1 :: (store (s64) into 
%fixed-stack.9, align 16) - ; CHECK: STD killed $x23, -72, $x1 :: (store (s64) into %fixed-stack.8) - ; CHECK: STD killed $x24, -64, $x1 :: (store (s64) into %fixed-stack.7, align 16) - ; CHECK: STD killed $x25, -56, $x1 :: (store (s64) into %fixed-stack.6) - ; CHECK: STD killed $x26, -48, $x1 :: (store (s64) into %fixed-stack.5, align 16) - ; CHECK: STD killed $x27, -40, $x1 :: (store (s64) into %fixed-stack.4) - ; CHECK: STD killed $x28, -32, $x1 :: (store (s64) into %fixed-stack.3, align 16) - ; CHECK: STD killed $x29, -24, $x1 :: (store (s64) into %fixed-stack.2) - ; CHECK: STD killed $x30, -16, $x1 :: (store (s64) into %fixed-stack.1, align 16) - ; CHECK: STD killed $x31, -8, $x1 :: (store (s64) into %fixed-stack.0) - ; CHECK: $x7 = OR8 $x3, $x3 - ; CHECK: renamable $g8p4 = LQARX $x5, $x6 - ; CHECK: STD killed $x8, -160, $x1 - ; CHECK: STD killed $x9, -152, $x1 - ; CHECK: renamable $g8p5 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: STD killed $x8, -176, $x1 - ; CHECK: STD killed $x9, -168, $x1 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: STD killed $x8, -192, $x1 - ; CHECK: STD killed $x9, -184, $x1 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: STD killed $x8, -208, $x1 - ; CHECK: STD killed $x9, -200, $x1 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: STD killed $x8, -224, $x1 - ; CHECK: STD killed $x9, -216, $x1 - ; CHECK: renamable $g8p10 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p9 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p8 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p7 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p15 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p11 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p12 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p13 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p14 = LQARX $x3, renamable $x4 - ; CHECK: renamable $g8p4 = LQARX $x3, renamable $x4 - ; CHECK: $x3 = OR8 $x11, $x11 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p14, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p13, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p12, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p11, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p15, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p7, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p8, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p9, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: STQCX killed renamable $g8p10, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: $x8 = LD -224, $x1 - ; CHECK: $x9 = LD -216, $x1 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: $x8 = LD -208, $x1 - ; CHECK: $x9 = LD -200, $x1 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: $x8 = LD -192, $x1 - ; CHECK: $x9 = LD -184, $x1 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0 - ; CHECK: $x8 = LD -176, $x1 - ; CHECK: $x9 = LD -168, $x1 - ; CHECK: STQCX killed renamable $g8p4, renamable $x7, 
renamable $x4, implicit-def dead $cr0
- ; CHECK: STQCX killed renamable $g8p5, killed renamable $x7, killed renamable $x4, implicit-def dead $cr0
- ; CHECK: $x8 = LD -160, $x1
- ; CHECK: $x9 = LD -152, $x1
- ; CHECK: STQCX killed renamable $g8p4, $x5, $x6, implicit-def dead $cr0
- ; CHECK: $x31 = LD -8, $x1 :: (load (s64) from %fixed-stack.0)
- ; CHECK: $x30 = LD -16, $x1 :: (load (s64) from %fixed-stack.1, align 16)
- ; CHECK: $x29 = LD -24, $x1 :: (load (s64) from %fixed-stack.2)
- ; CHECK: $x28 = LD -32, $x1 :: (load (s64) from %fixed-stack.3, align 16)
- ; CHECK: $x27 = LD -40, $x1 :: (load (s64) from %fixed-stack.4)
- ; CHECK: $x26 = LD -48, $x1 :: (load (s64) from %fixed-stack.5, align 16)
- ; CHECK: $x25 = LD -56, $x1 :: (load (s64) from %fixed-stack.6)
- ; CHECK: $x24 = LD -64, $x1 :: (load (s64) from %fixed-stack.7, align 16)
- ; CHECK: $x23 = LD -72, $x1 :: (load (s64) from %fixed-stack.8)
- ; CHECK: $x22 = LD -80, $x1 :: (load (s64) from %fixed-stack.9, align 16)
- ; CHECK: $x21 = LD -88, $x1 :: (load (s64) from %fixed-stack.10)
- ; CHECK: $x20 = LD -96, $x1 :: (load (s64) from %fixed-stack.11, align 16)
- ; CHECK: $x19 = LD -104, $x1 :: (load (s64) from %fixed-stack.12)
- ; CHECK: $x18 = LD -112, $x1 :: (load (s64) from %fixed-stack.13, align 16)
- ; CHECK: $x17 = LD -120, $x1 :: (load (s64) from %fixed-stack.14)
- ; CHECK: $x16 = LD -128, $x1 :: (load (s64) from %fixed-stack.15, align 16)
- ; CHECK: $x15 = LD -136, $x1 :: (load (s64) from %fixed-stack.16)
- ; CHECK: $x14 = LD -144, $x1 :: (load (s64) from %fixed-stack.17, align 16)
- ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: STD killed $x14, -144, $x1 :: (store (s64) into %fixed-stack.17, align 16)
+ ; CHECK-NEXT: STD killed $x15, -136, $x1 :: (store (s64) into %fixed-stack.16)
+ ; CHECK-NEXT: STD killed $x16, -128, $x1 :: (store (s64) into %fixed-stack.15, align 16)
+ ; CHECK-NEXT: STD killed $x17, -120, $x1 :: (store (s64) into %fixed-stack.14)
+ ; CHECK-NEXT: STD killed $x18, -112, $x1 :: (store (s64) into %fixed-stack.13, align 16)
+ ; CHECK-NEXT: STD killed $x19, -104, $x1 :: (store (s64) into %fixed-stack.12)
+ ; CHECK-NEXT: STD killed $x20, -96, $x1 :: (store (s64) into %fixed-stack.11, align 16)
+ ; CHECK-NEXT: STD killed $x21, -88, $x1 :: (store (s64) into %fixed-stack.10)
+ ; CHECK-NEXT: STD killed $x22, -80, $x1 :: (store (s64) into %fixed-stack.9, align 16)
+ ; CHECK-NEXT: STD killed $x23, -72, $x1 :: (store (s64) into %fixed-stack.8)
+ ; CHECK-NEXT: STD killed $x24, -64, $x1 :: (store (s64) into %fixed-stack.7, align 16)
+ ; CHECK-NEXT: STD killed $x25, -56, $x1 :: (store (s64) into %fixed-stack.6)
+ ; CHECK-NEXT: STD killed $x26, -48, $x1 :: (store (s64) into %fixed-stack.5, align 16)
+ ; CHECK-NEXT: STD killed $x27, -40, $x1 :: (store (s64) into %fixed-stack.4)
+ ; CHECK-NEXT: STD killed $x28, -32, $x1 :: (store (s64) into %fixed-stack.3, align 16)
+ ; CHECK-NEXT: STD killed $x29, -24, $x1 :: (store (s64) into %fixed-stack.2)
+ ; CHECK-NEXT: STD killed $x30, -16, $x1 :: (store (s64) into %fixed-stack.1, align 16)
+ ; CHECK-NEXT: STD killed $x31, -8, $x1 :: (store (s64) into %fixed-stack.0)
+ ; CHECK-NEXT: $x7 = OR8 $x3, $x3
+ ; CHECK-NEXT: renamable $g8p4 = LQARX $x5, $x6
+ ; CHECK-NEXT: STD killed $x8, -160, $x1
+ ; CHECK-NEXT: STD killed $x9, -152, $x1
+ ; CHECK-NEXT: renamable $g8p13 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: STD killed $x8, -176, $x1
+ ; CHECK-NEXT: STD killed $x9, -168, $x1
+ ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: STD killed $x8, -192, $x1
+ ; CHECK-NEXT: STD killed $x9, -184, $x1
+ ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: STD killed $x8, -208, $x1
+ ; CHECK-NEXT: STD killed $x9, -200, $x1
+ ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: STD killed $x8, -224, $x1
+ ; CHECK-NEXT: STD killed $x9, -216, $x1
+ ; CHECK-NEXT: renamable $g8p10 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p9 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p8 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p7 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p15 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p11 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p12 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p14 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p5 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: renamable $g8p4 = LQARX $x3, renamable $x4
+ ; CHECK-NEXT: $x3 = OR8 $x27, $x27
+ ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p5, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p14, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p12, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p11, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p15, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p7, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p8, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p9, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p10, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: $x8 = LD -224, $x1
+ ; CHECK-NEXT: $x9 = LD -216, $x1
+ ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: $x8 = LD -208, $x1
+ ; CHECK-NEXT: $x9 = LD -200, $x1
+ ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: $x8 = LD -192, $x1
+ ; CHECK-NEXT: $x9 = LD -184, $x1
+ ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: $x8 = LD -176, $x1
+ ; CHECK-NEXT: $x9 = LD -168, $x1
+ ; CHECK-NEXT: STQCX killed renamable $g8p4, renamable $x7, renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: STQCX killed renamable $g8p13, killed renamable $x7, killed renamable $x4, implicit-def dead $cr0
+ ; CHECK-NEXT: $x8 = LD -160, $x1
+ ; CHECK-NEXT: $x9 = LD -152, $x1
+ ; CHECK-NEXT: STQCX killed renamable $g8p4, $x5, $x6, implicit-def dead $cr0
+ ; CHECK-NEXT: $x31 = LD -8, $x1 :: (load (s64) from %fixed-stack.0)
+ ; CHECK-NEXT: $x30 = LD -16, $x1 :: (load (s64) from %fixed-stack.1, align 16)
+ ; CHECK-NEXT: $x29 = LD -24, $x1 :: (load (s64) from %fixed-stack.2)
+ ; CHECK-NEXT: $x28 = LD -32, $x1 :: (load (s64) from %fixed-stack.3, align 16)
+ ; CHECK-NEXT: $x27 = LD -40, $x1 :: (load (s64) from %fixed-stack.4)
+ ; CHECK-NEXT: $x26 = LD -48, $x1 :: (load (s64) from %fixed-stack.5, align 16)
+ ; CHECK-NEXT: $x25 = LD -56, $x1 :: (load (s64) from %fixed-stack.6)
+ ; CHECK-NEXT: $x24 = LD -64, $x1 :: (load (s64) from %fixed-stack.7, align 16)
+ ; CHECK-NEXT: $x23 = LD -72, $x1 :: (load (s64) from %fixed-stack.8)
+ ; CHECK-NEXT: $x22 = LD -80, $x1 :: (load (s64) from %fixed-stack.9, align 16)
+ ; CHECK-NEXT: $x21 = LD -88, $x1 :: (load (s64) from %fixed-stack.10)
+ ; CHECK-NEXT: $x20 = LD -96, $x1 :: (load (s64) from %fixed-stack.11, align 16)
+ ; CHECK-NEXT: $x19 = LD -104, $x1 :: (load (s64) from %fixed-stack.12)
+ ; CHECK-NEXT: $x18 = LD -112, $x1 :: (load (s64) from %fixed-stack.13, align 16)
+ ; CHECK-NEXT: $x17 = LD -120, $x1 :: (load (s64) from %fixed-stack.14)
+ ; CHECK-NEXT: $x16 = LD -128, $x1 :: (load (s64) from %fixed-stack.15, align 16)
+ ; CHECK-NEXT: $x15 = LD -136, $x1 :: (load (s64) from %fixed-stack.16)
+ ; CHECK-NEXT: $x14 = LD -144, $x1 :: (load (s64) from %fixed-stack.17, align 16)
+ ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit $x3
%addr0:g8rc_nox0 = COPY $x3
%addr1:g8rc = COPY $x4
%0:g8prc = LQARX $x5, $x6
@@ -209,10 +213,11 @@ body: |
liveins: $g8p8
; CHECK-LABEL: name: copy_g8prc
; CHECK: liveins: $g8p8
- ; CHECK: $x4 = OR8 $x16, $x16
- ; CHECK: $x5 = OR8 $x17, $x17
- ; CHECK: $x3 = OR8 $x5, $x5
- ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit killed $x3, implicit $x4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x4 = OR8 $x16, $x16
+ ; CHECK-NEXT: $x5 = OR8 $x17, $x17
+ ; CHECK-NEXT: $x3 = OR8 $x5, $x5
+ ; CHECK-NEXT: BLR8 implicit $lr8, implicit undef $rm, implicit killed $x3, implicit $x4
%0:g8prc = COPY $g8p8
$x3 = COPY %0.sub_gp8_x1:g8prc
$x4 = COPY %0.sub_gp8_x0:g8prc
diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
index 37baef6043884..900069c6216bf 100644
--- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -196,13 +196,13 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
; CHECK-NEXT: # %bb.1: # %bb3.preheader
; CHECK-NEXT: cmpldi r4, 1
; CHECK-NEXT: li r5, 1
-; CHECK-NEXT: addi r9, r3, 4002
+; CHECK-NEXT: addi r10, r3, 4002
; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill
; CHECK-NEXT: li r6, -1
; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill
; CHECK-NEXT: li r7, 3
; CHECK-NEXT: li r8, 5
-; CHECK-NEXT: li r10, 9
+; CHECK-NEXT: li r9, 9
; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
@@ -213,17 +213,17 @@ define i64 @test_max_number_reminder(ptr %arg, i32 signext %arg1) {
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB2_2: # %bb3
; CHECK-NEXT: #
-; CHECK-NEXT: ldx r11, r9, r6
-; CHECK-NEXT: ld r12, 0(r9)
-; CHECK-NEXT: ldx r0, r9, r5
-; CHECK-NEXT: ldx r30, r9, r7
+; CHECK-NEXT: ldx r11, r10, r6
+; CHECK-NEXT: ld r12, 0(r10)
+; CHECK-NEXT: ldx r0, r10, r5
+; CHECK-NEXT: ldx r30, r10, r7
; CHECK-NEXT: mulld r11, r12, r11
-; CHECK-NEXT: ld r29, 4(r9)
-; CHECK-NEXT: ldx r28, r9, r8
-; CHECK-NEXT: ld r27, 12(r9)
-; CHECK-NEXT: ld r26, 8(r9)
-; CHECK-NEXT: ldx r25, r9, r10
-; CHECK-NEXT: addi r9, r9, 1
+; CHECK-NEXT: ld r29, 4(r10)
+; CHECK-NEXT: ldx r28, r10, r8
+; CHECK-NEXT: ld r27, 12(r10)
+; CHECK-NEXT: ld r26, 8(r10)
+; CHECK-NEXT: ldx r25, r10, r9
+; CHECK-NEXT: addi r10, r10, 1
; CHECK-NEXT: mulld r11, r11, r0
; CHECK-NEXT: mulld r11, r11, r30
; CHECK-NEXT: mulld r11, r11, r29
diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
index 5d2232319c1f5..9f62477ae01df 100644
--- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll
@@ -60,122 +60,121 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: std 23, 472(1) # 8-byte Folded Spill
; CHECK-NEXT: mr 22, 5
; CHECK-NEXT: ld 5, 848(1)
+; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: mr 11, 7
; CHECK-NEXT: ld 23, 688(1)
-; CHECK-NEXT: addi 3, 3, 1
-; CHECK-NEXT: ld 2, 760(1)
-; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 28, 824(1)
; CHECK-NEXT: ld 7, 728(1)
; CHECK-NEXT: std 18, 432(1) # 8-byte Folded Spill
; CHECK-NEXT: std 19, 440(1) # 8-byte Folded Spill
; CHECK-NEXT: mr 18, 6
-; CHECK-NEXT: ld 6, 712(1)
-; CHECK-NEXT: cmpldi 3, 9
+; CHECK-NEXT: li 6, 9
; CHECK-NEXT: ld 19, 768(1)
-; CHECK-NEXT: std 10, 64(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 6, 72(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 5, 200(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 5, 840(1)
-; CHECK-NEXT: lxv 33, 0(6)
+; CHECK-NEXT: ld 2, 760(1)
+; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill
+; CHECK-NEXT: cmpldi 3, 9
+; CHECK-NEXT: ld 27, 816(1)
+; CHECK-NEXT: ld 26, 808(1)
; CHECK-NEXT: std 14, 400(1) # 8-byte Folded Spill
; CHECK-NEXT: std 15, 408(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 15, 736(1)
+; CHECK-NEXT: lxv 39, 0(8)
; CHECK-NEXT: std 30, 528(1) # 8-byte Folded Spill
; CHECK-NEXT: std 31, 536(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 30, 704(1)
+; CHECK-NEXT: lxv 38, 0(9)
; CHECK-NEXT: std 20, 448(1) # 8-byte Folded Spill
; CHECK-NEXT: std 21, 456(1) # 8-byte Folded Spill
; CHECK-NEXT: ld 21, 784(1)
; CHECK-NEXT: ld 20, 776(1)
-; CHECK-NEXT: lxv 10, 0(19)
-; CHECK-NEXT: lxv 7, 0(21)
-; CHECK-NEXT: ld 15, 736(1)
-; CHECK-NEXT: ld 29, 704(1)
-; CHECK-NEXT: ld 30, 720(1)
-; CHECK-NEXT: std 2, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill
+; CHECK-NEXT: iselgt 3, 3, 6
+; CHECK-NEXT: ld 6, 720(1)
+; CHECK-NEXT: ld 24, 792(1)
+; CHECK-NEXT: std 10, 72(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 7, 80(1) # 8-byte Folded Spill
+; CHECK-NEXT: addi 3, 3, -2
+; CHECK-NEXT: lxv 6, 0(19)
+; CHECK-NEXT: lxv 11, 0(7)
+; CHECK-NEXT: std 5, 200(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 23, 40(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 6, 48(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 5, 840(1)
+; CHECK-NEXT: lxv 12, 0(6)
+; CHECK-NEXT: rldicl 12, 3, 61, 3
; CHECK-NEXT: std 19, 120(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 42, 0(9)
-; CHECK-NEXT: lxv 37, 0(7)
; CHECK-NEXT: std 20, 128(1) # 8-byte Folded Spill
; CHECK-NEXT: std 21, 136(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 43, 0(8)
-; CHECK-NEXT: lxv 41, 0(10)
+; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 4, 0(21)
+; CHECK-NEXT: ld 25, 800(1)
+; CHECK-NEXT: lxv 33, 0(10)
+; CHECK-NEXT: lxv 32, 0(23)
+; CHECK-NEXT: lxv 36, 0(30)
; CHECK-NEXT: std 16, 416(1) # 8-byte Folded Spill
; CHECK-NEXT: std 17, 424(1) # 8-byte Folded Spill
; CHECK-NEXT: ld 17, 752(1)
; CHECK-NEXT: ld 16, 744(1)
-; CHECK-NEXT: std 24, 480(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 25, 488(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 25, 800(1)
-; CHECK-NEXT: ld 24, 792(1)
-; CHECK-NEXT: std 26, 496(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 27, 504(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 27, 816(1)
-; CHECK-NEXT: ld 26, 808(1)
-; CHECK-NEXT: std 8, 48(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 9, 56(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 40, 0(23)
-; CHECK-NEXT: lxv 38, 0(29)
-; CHECK-NEXT: std 7, 80(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 15, 88(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 32, 0(30)
-; CHECK-NEXT: lxv 36, 0(15)
+; CHECK-NEXT: std 28, 512(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 29, 520(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 29, 712(1)
+; CHECK-NEXT: ld 28, 696(1)
+; CHECK-NEXT: std 8, 56(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 9, 64(1) # 8-byte Folded Spill
+; CHECK-NEXT: lxv 37, 0(28)
+; CHECK-NEXT: lxv 13, 0(29)
; CHECK-NEXT: mr 8, 29
-; CHECK-NEXT: mr 10, 30
+; CHECK-NEXT: mr 9, 30
+; CHECK-NEXT: mr 10, 28
+; CHECK-NEXT: std 25, 152(1) # 8-byte Folded Spill
; CHECK-NEXT: std 26, 160(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 27, 168(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 13, 0(16)
-; CHECK-NEXT: lxv 12, 0(17)
+; CHECK-NEXT: lxv 10, 0(15)
+; CHECK-NEXT: lxv 9, 0(16)
+; CHECK-NEXT: li 28, 1
; CHECK-NEXT: stfd 26, 544(1) # 8-byte Folded Spill
; CHECK-NEXT: stfd 27, 552(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 11, 0(2)
-; CHECK-NEXT: lxv 9, 0(20)
+; CHECK-NEXT: lxv 8, 0(17)
+; CHECK-NEXT: lxv 7, 0(2)
; CHECK-NEXT: stfd 28, 560(1) # 8-byte Folded Spill
; CHECK-NEXT: stfd 29, 568(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 5, 0(24)
-; CHECK-NEXT: lxv 4, 0(25)
+; CHECK-NEXT: lxv 5, 0(20)
+; CHECK-NEXT: lxv 3, 0(24)
; CHECK-NEXT: stfd 30, 576(1) # 8-byte Folded Spill
; CHECK-NEXT: stfd 31, 584(1) # 8-byte Folded Spill
-; CHECK-NEXT: lxv 2, 0(26)
-; CHECK-NEXT: lxv 0, 0(27)
-; CHECK-NEXT: li 27, 0
+; CHECK-NEXT: lxv 2, 0(25)
+; CHECK-NEXT: lxv 1, 0(26)
; CHECK-NEXT: stxv 52, 208(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 53, 224(1) # 16-byte Folded Spill
-; CHECK-NEXT: lxv 1, 0(28)
+; CHECK-NEXT: lxv 0, 0(27)
; CHECK-NEXT: stxv 54, 240(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 55, 256(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 56, 272(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 57, 288(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 58, 304(1) # 16-byte Folded Spill
+; CHECK-NEXT: std 5, 192(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 5, 832(1)
; CHECK-NEXT: stxv 59, 320(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 60, 336(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 61, 352(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 62, 368(1) # 16-byte Folded Spill
; CHECK-NEXT: stxv 63, 384(1) # 16-byte Folded Spill
-; CHECK-NEXT: std 5, 192(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 5, 832(1)
+; CHECK-NEXT: std 15, 88(1) # 8-byte Folded Spill
; CHECK-NEXT: std 16, 96(1) # 8-byte Folded Spill
; CHECK-NEXT: std 17, 104(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 24, 144(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 25, 152(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 2, 112(1) # 8-byte Folded Spill
; CHECK-NEXT: std 5, 184(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 28, 176(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 5, 696(1)
-; CHECK-NEXT: li 28, 1
-; CHECK-NEXT: ld 7, 184(1) # 8-byte Folded Reload
-; CHECK-NEXT: lxv 3, 0(7)
-; CHECK-NEXT: std 5, 32(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 23, 40(1) # 8-byte Folded Spill
+; CHECK-NEXT: ld 5, 824(1)
+; CHECK-NEXT: std 5, 176(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 27, 168(1) # 8-byte Folded Spill
; CHECK-NEXT: lwa 5, 0(11)
-; CHECK-NEXT: li 11, 9
-; CHECK-NEXT: ld 9, 32(1) # 8-byte Folded Reload
-; CHECK-NEXT: iselgt 3, 3, 11
-; CHECK-NEXT: addi 3, 3, -2
+; CHECK-NEXT: li 27, 0
+; CHECK-NEXT: ld 7, 176(1) # 8-byte Folded Reload
; CHECK-NEXT: mulli 6, 5, 40
; CHECK-NEXT: sldi 0, 5, 4
; CHECK-NEXT: extswsli 14, 5, 3
-; CHECK-NEXT: rldicl 12, 3, 61, 3
-; CHECK-NEXT: lxv 39, 0(9)
+; CHECK-NEXT: lxv 40, 0(7)
+; CHECK-NEXT: ld 7, 184(1) # 8-byte Folded Reload
; CHECK-NEXT: add 31, 14, 22
; CHECK-NEXT: add 11, 0, 22
; CHECK-NEXT: mr 26, 22
@@ -186,12 +185,13 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: add 19, 22, 6
; CHECK-NEXT: sldi 6, 5, 5
; CHECK-NEXT: mulli 5, 5, 24
+; CHECK-NEXT: lxv 41, 0(7)
; CHECK-NEXT: add 20, 22, 6
; CHECK-NEXT: add 21, 22, 5
; CHECK-NEXT: ld 5, 192(1) # 8-byte Folded Reload
-; CHECK-NEXT: lxv 8, 0(5)
+; CHECK-NEXT: lxv 43, 0(5)
; CHECK-NEXT: ld 5, 200(1) # 8-byte Folded Reload
-; CHECK-NEXT: lxv 6, 0(5)
+; CHECK-NEXT: lxv 42, 0(5)
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_3: # %_loop_2_do_.lr.ph
; CHECK-NEXT: # =>This Loop Header: Depth=1
@@ -212,9 +212,9 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
; CHECK-NEXT: lxvp 34, 0(6)
; CHECK-NEXT: lxvp 44, 0(5)
-; CHECK-NEXT: xvmaddadp 43, 45, 35
+; CHECK-NEXT: xvmaddadp 39, 45, 35
; CHECK-NEXT: lxvp 46, 0(24)
-; CHECK-NEXT: xvmaddadp 42, 47, 35
+; CHECK-NEXT: xvmaddadp 38, 47, 35
; CHECK-NEXT: lxvp 48, 0(25)
; CHECK-NEXT: lxvp 50, 0(29)
; CHECK-NEXT: lxvp 62, 0(30)
@@ -226,28 +226,28 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: lxvp 30, 32(29)
; CHECK-NEXT: lxvp 28, 32(30)
; CHECK-NEXT: lxvp 26, 32(2)
-; CHECK-NEXT: xvmaddadp 41, 49, 35
-; CHECK-NEXT: xvmaddadp 40, 51, 35
-; CHECK-NEXT: xvmaddadp 39, 63, 35
-; CHECK-NEXT: xvmaddadp 38, 61, 35
-; CHECK-NEXT: xvmaddadp 33, 44, 34
-; CHECK-NEXT: xvmaddadp 32, 46, 34
-; CHECK-NEXT: xvmaddadp 37, 48, 34
-; CHECK-NEXT: xvmaddadp 36, 50, 34
-; CHECK-NEXT: xvmaddadp 13, 62, 34
-; CHECK-NEXT: xvmaddadp 12, 60, 34
-; CHECK-NEXT: xvmaddadp 11, 57, 59
-; CHECK-NEXT: xvmaddadp 10, 55, 59
-; CHECK-NEXT: xvmaddadp 9, 53, 59
-; CHECK-NEXT: xvmaddadp 7, 31, 59
-; CHECK-NEXT: xvmaddadp 5, 29, 59
-; CHECK-NEXT: xvmaddadp 4, 27, 59
-; CHECK-NEXT: xvmaddadp 2, 56, 58
+; CHECK-NEXT: xvmaddadp 33, 49, 35
+; CHECK-NEXT: xvmaddadp 32, 51, 35
+; CHECK-NEXT: xvmaddadp 37, 63, 35
+; CHECK-NEXT: xvmaddadp 36, 61, 35
+; CHECK-NEXT: xvmaddadp 13, 44, 34
+; CHECK-NEXT: xvmaddadp 12, 46, 34
+; CHECK-NEXT: xvmaddadp 11, 48, 34
+; CHECK-NEXT: xvmaddadp 10, 50, 34
+; CHECK-NEXT: xvmaddadp 9, 62, 34
+; CHECK-NEXT: xvmaddadp 8, 60, 34
+; CHECK-NEXT: xvmaddadp 7, 57, 59
+; CHECK-NEXT: xvmaddadp 6, 55, 59
+; CHECK-NEXT: xvmaddadp 5, 53, 59
+; CHECK-NEXT: xvmaddadp 4, 31, 59
+; CHECK-NEXT: xvmaddadp 3, 29, 59
+; CHECK-NEXT: xvmaddadp 2, 27, 59
+; CHECK-NEXT: xvmaddadp 1, 56, 58
; CHECK-NEXT: xvmaddadp 0, 54, 58
-; CHECK-NEXT: xvmaddadp 1, 52, 58
-; CHECK-NEXT: xvmaddadp 3, 30, 58
-; CHECK-NEXT: xvmaddadp 8, 28, 58
-; CHECK-NEXT: xvmaddadp 6, 26, 58
+; CHECK-NEXT: xvmaddadp 40, 52, 58
+; CHECK-NEXT: xvmaddadp 41, 30, 58
+; CHECK-NEXT: xvmaddadp 43, 28, 58
+; CHECK-NEXT: xvmaddadp 42, 26, 58
; CHECK-NEXT: addi 6, 6, 64
; CHECK-NEXT: addi 5, 5, 64
; CHECK-NEXT: addi 24, 24, 64
@@ -269,10 +269,10 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: cmpld 28, 4
; CHECK-NEXT: ble 0, .LBB0_3
; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit
-; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload
-; CHECK-NEXT: lxv 63, 384(1) # 16-byte Folded Reload
-; CHECK-NEXT: stxv 43, 0(3)
; CHECK-NEXT: ld 3, 56(1) # 8-byte Folded Reload
+; CHECK-NEXT: lxv 63, 384(1) # 16-byte Folded Reload
+; CHECK-NEXT: stxv 39, 0(3)
+; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload
; CHECK-NEXT: lxv 62, 368(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 61, 352(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 60, 336(1) # 16-byte Folded Reload
@@ -284,8 +284,8 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: lxv 54, 240(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 53, 224(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 52, 208(1) # 16-byte Folded Reload
-; CHECK-NEXT: stxv 42, 0(3)
-; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload
+; CHECK-NEXT: stxv 38, 0(3)
+; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload
; CHECK-NEXT: lfd 31, 584(1) # 8-byte Folded Reload
; CHECK-NEXT: lfd 30, 576(1) # 8-byte Folded Reload
; CHECK-NEXT: lfd 29, 568(1) # 8-byte Folded Reload
@@ -297,7 +297,7 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: ld 29, 520(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 28, 512(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 27, 504(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 41, 0(3)
+; CHECK-NEXT: stxv 33, 0(3)
; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 26, 496(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 25, 488(1) # 8-byte Folded Reload
@@ -310,46 +310,46 @@ define void @foo(ptr %.m, ptr %.n, ptr %.a, ptr %.x, ptr %.l, ptr %.vy01, ptr %.
; CHECK-NEXT: ld 18, 432(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 17, 424(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 16, 416(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 40, 0(3)
-; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 39, 0(9)
-; CHECK-NEXT: stxv 38, 0(8)
+; CHECK-NEXT: stxv 32, 0(3)
+; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload
+; CHECK-NEXT: stxv 37, 0(10)
+; CHECK-NEXT: stxv 36, 0(9)
+; CHECK-NEXT: stxv 13, 0(8)
; CHECK-NEXT: ld 15, 408(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 14, 400(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 33, 0(3)
+; CHECK-NEXT: stxv 12, 0(3)
; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 32, 0(10)
-; CHECK-NEXT: stxv 37, 0(3)
+; CHECK-NEXT: stxv 11, 0(3)
; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 36, 0(3)
+; CHECK-NEXT: stxv 10, 0(3)
; CHECK-NEXT: ld 3, 96(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 13, 0(3)
+; CHECK-NEXT: stxv 9, 0(3)
; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 12, 0(3)
+; CHECK-NEXT: stxv 8, 0(3)
; CHECK-NEXT: ld 3, 112(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 11, 0(3)
+; CHECK-NEXT: stxv 7, 0(3)
; CHECK-NEXT: ld 3, 120(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 10, 0(3)
+; CHECK-NEXT: stxv 6, 0(3)
; CHECK-NEXT: ld 3, 128(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 9, 0(3)
+; CHECK-NEXT: stxv 5, 0(3)
; CHECK-NEXT: ld 3, 136(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 7, 0(3)
+; CHECK-NEXT: stxv 4, 0(3)
; CHECK-NEXT: ld 3, 144(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 5, 0(3)
+; CHECK-NEXT: stxv 3, 0(3)
; CHECK-NEXT: ld 3, 152(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 4, 0(3)
-; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload
; CHECK-NEXT: stxv 2, 0(3)
+; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload
+; CHECK-NEXT: stxv 1, 0(3)
; CHECK-NEXT: ld 3, 168(1) # 8-byte Folded Reload
; CHECK-NEXT: stxv 0, 0(3)
; CHECK-NEXT: ld 3, 176(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 1, 0(3)
+; CHECK-NEXT: stxv 40, 0(3)
; CHECK-NEXT: ld 3, 184(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 3, 0(3)
+; CHECK-NEXT: stxv 41, 0(3)
; CHECK-NEXT: ld 3, 192(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 8, 0(3)
+; CHECK-NEXT: stxv 43, 0(3)
; CHECK-NEXT: ld 3, 200(1) # 8-byte Folded Reload
-; CHECK-NEXT: stxv 6, 0(3)
+; CHECK-NEXT: stxv 42, 0(3)
; CHECK-NEXT: addi 1, 1, 592
; CHECK-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 3d2b6bc4da2a9..7a6640fea2d1e 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -280,37 +280,37 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR9-LE-NEXT: std r25, -56(r1) # 8-byte Folded Spill
; CHECK-PWR9-LE-NEXT: clrlwi r6, r6, 24
; CHECK-PWR9-LE-NEXT: clrlwi r3, r3, 24
+; CHECK-PWR9-LE-NEXT: clrlwi r8, r8, 24
+; CHECK-PWR9-LE-NEXT: clrlwi r5, r5, 24
; CHECK-PWR9-LE-NEXT: vextubrx r7, r4, v2
; CHECK-PWR9-LE-NEXT: vextubrx r4, r4, v3
-; CHECK-PWR9-LE-NEXT: clrlwi r8, r8, 24
; CHECK-PWR9-LE-NEXT: sub r3, r6, r3
-; CHECK-PWR9-LE-NEXT: clrlwi r5, r5, 24
+; CHECK-PWR9-LE-NEXT: sub r6, r8, r5
; CHECK-PWR9-LE-NEXT: clrlwi r7, r7, 24
; CHECK-PWR9-LE-NEXT: clrlwi r4, r4, 24
-; CHECK-PWR9-LE-NEXT: sub r5, r8, r5
; CHECK-PWR9-LE-NEXT: sub r4, r7, r4
-; CHECK-PWR9-LE-NEXT: srawi r6, r3, 31
+; CHECK-PWR9-LE-NEXT: srawi r5, r3, 31
; CHECK-PWR9-LE-NEXT: srawi r7, r4, 31
-; CHECK-PWR9-LE-NEXT: xor r3, r3, r6
+; CHECK-PWR9-LE-NEXT: xor r3, r3, r5
; CHECK-PWR9-LE-NEXT: xor r4, r4, r7
-; CHECK-PWR9-LE-NEXT: sub r6, r3, r6
-; CHECK-PWR9-LE-NEXT: srawi r3, r5, 31
+; CHECK-PWR9-LE-NEXT: sub r5, r3, r5
+; CHECK-PWR9-LE-NEXT: srawi r3, r6, 31
; CHECK-PWR9-LE-NEXT: sub r4, r4, r7
-; CHECK-PWR9-LE-NEXT: xor r5, r5, r3
-; CHECK-PWR9-LE-NEXT: sub r3, r5, r3
-; CHECK-PWR9-LE-NEXT: li r5, 3
-; CHECK-PWR9-LE-NEXT: vextubrx r7, r5, v2
-; CHECK-PWR9-LE-NEXT: vextubrx r5, r5, v3
+; CHECK-PWR9-LE-NEXT: xor r6, r6, r3
+; CHECK-PWR9-LE-NEXT: sub r3, r6, r3
+; CHECK-PWR9-LE-NEXT: li r6, 3
+; CHECK-PWR9-LE-NEXT: vextubrx r7, r6, v2
+; CHECK-PWR9-LE-NEXT: vextubrx r6, r6, v3
; CHECK-PWR9-LE-NEXT: clrlwi r7, r7, 24
-; CHECK-PWR9-LE-NEXT: clrlwi r5, r5, 24
-; CHECK-PWR9-LE-NEXT: sub r5, r7, r5
-; CHECK-PWR9-LE-NEXT: srawi r7, r5, 31
-; CHECK-PWR9-LE-NEXT: xor r5, r5, r7
-; CHECK-PWR9-LE-NEXT: sub r5, r5, r7
+; CHECK-PWR9-LE-NEXT: clrlwi r6, r6, 24
+; CHECK-PWR9-LE-NEXT: sub r6, r7, r6
+; CHECK-PWR9-LE-NEXT: srawi r7, r6, 31
+; CHECK-PWR9-LE-NEXT: xor r6, r6, r7
+; CHECK-PWR9-LE-NEXT: sub r6, r6, r7
; CHECK-PWR9-LE-NEXT: li r7, 4
; CHECK-PWR9-LE-NEXT: vextubrx r8, r7, v2
; CHECK-PWR9-LE-NEXT: vextubrx r7, r7, v3
-; CHECK-PWR9-LE-NEXT: mtvsrd v4, r5
+; CHECK-PWR9-LE-NEXT: mtvsrd v4, r6
; CHECK-PWR9-LE-NEXT: clrlwi r8, r8, 24
; CHECK-PWR9-LE-NEXT: clrlwi r7, r7, 24
; CHECK-PWR9-LE-NEXT: sub r7, r8, r7
@@ -411,7 +411,7 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR9-LE-NEXT: li r26, 15
; CHECK-PWR9-LE-NEXT: vextubrx r25, r26, v2
; CHECK-PWR9-LE-NEXT: vextubrx r26, r26, v3
-; CHECK-PWR9-LE-NEXT: mtvsrd v2, r6
+; CHECK-PWR9-LE-NEXT: mtvsrd v2, r5
; CHECK-PWR9-LE-NEXT: mtvsrd v3, r4
; CHECK-PWR9-LE-NEXT: vmrghb v2, v3, v2
; CHECK-PWR9-LE-NEXT: mtvsrd v3, r3
@@ -652,87 +652,95 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR8: # %bb.0: # %entry
; CHECK-PWR8-NEXT: xxswapd vs0, v2
; CHECK-PWR8-NEXT: xxswapd vs1, v3
+; CHECK-PWR8-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-PWR8-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-PWR8-NEXT: std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-PWR8-NEXT: std r26, -48(r1) # 8-byte Folded Spill
+; CHECK-PWR8-NEXT: mffprd r11, f0
+; CHECK-PWR8-NEXT: mffprd r8, f1
; CHECK-PWR8-NEXT: std r27, -40(r1) # 8-byte Folded Spill
-; CHECK-PWR8-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-PWR8-NEXT: mffprd r5, f0
-; CHECK-PWR8-NEXT: mffprd r11, f1
; CHECK-PWR8-NEXT: std r25, -56(r1) # 8-byte Folded Spill
-; CHECK-PWR8-NEXT: std r26, -48(r1) # 8-byte Folded Spill
-; CHECK-PWR8-NEXT: clrldi r3, r5, 56
-; CHECK-PWR8-NEXT: clrldi r4, r11, 56
-; CHECK-PWR8-NEXT: rldicl r6, r5, 56, 56
-; CHECK-PWR8-NEXT: rldicl r7, r11, 56, 56
-; CHECK-PWR8-NEXT: rldicl r10, r5, 40, 56
-; CHECK-PWR8-NEXT: rldicl r12, r11, 40, 56
-; CHECK-PWR8-NEXT: rldicl r8, r5, 48, 56
-; CHECK-PWR8-NEXT: rldicl r9, r11, 48, 56
-; CHECK-PWR8-NEXT: rldicl r29, r5, 24, 56
-; CHECK-PWR8-NEXT: rldicl r28, r11, 24, 56
-; CHECK-PWR8-NEXT: rldicl r27, r5, 16, 56
-; CHECK-PWR8-NEXT: rldicl r0, r5, 32, 56
-; CHECK-PWR8-NEXT: rldicl r30, r11, 32, 56
-; CHECK-PWR8-NEXT: rldicl r5, r5, 8, 56
+; CHECK-PWR8-NEXT: clrldi r3, r11, 56
+; CHECK-PWR8-NEXT: clrldi r4, r8, 56
+; CHECK-PWR8-NEXT: rldicl r5, r11, 56, 56
+; CHECK-PWR8-NEXT: rldicl r6, r8, 56, 56
+; CHECK-PWR8-NEXT: rldicl r7, r11, 48, 56
+; CHECK-PWR8-NEXT: rldicl r9, r8, 48, 56
+; CHECK-PWR8-NEXT: rldicl r0, r11, 32, 56
+; CHECK-PWR8-NEXT: rldicl r30, r8, 32, 56
+; CHECK-PWR8-NEXT: rldicl r29, r11, 24, 56
+; CHECK-PWR8-NEXT: rldicl r28, r8, 24, 56
+; CHECK-PWR8-NEXT: rldicl r10, r11, 40, 56
+; CHECK-PWR8-NEXT: rldicl r12, r8, 40, 56
+; CHECK-PWR8-NEXT: rldicl r27, r11, 16, 56
+; CHECK-PWR8-NEXT: rldicl r11, r11, 8, 56
; CHECK-PWR8-NEXT: std r24, -64(r1) # 8-byte Folded Spill
; CHECK-PWR8-NEXT: clrlwi r3, r3, 24
; CHECK-PWR8-NEXT: clrlwi r4, r4, 24
+; CHECK-PWR8-NEXT: clrlwi r5, r5, 24
; CHECK-PWR8-NEXT: clrlwi r6, r6, 24
; CHECK-PWR8-NEXT: clrlwi r7, r7, 24
-; CHECK-PWR8-NEXT: clrlwi r10, r10, 24
-; CHECK-PWR8-NEXT: clrlwi r12, r12, 24
-; CHECK-PWR8-NEXT: sub r3, r3, r4
-; CHECK-PWR8-NEXT: sub r4, r6, r7
-; CHECK-PWR8-NEXT: sub r7, r10, r12
-; CHECK-PWR8-NEXT: clrlwi r8, r8, 24
; CHECK-PWR8-NEXT: clrlwi r9, r9, 24
+; CHECK-PWR8-NEXT: sub r3, r3, r4
+; CHECK-PWR8-NEXT: clrlwi r0, r0, 24
+; CHECK-PWR8-NEXT: clrlwi r30, r30, 24
+; CHECK-PWR8-NEXT: sub r4, r5, r6
+; CHECK-PWR8-NEXT: sub r5, r7, r9
; CHECK-PWR8-NEXT: clrlwi r29, r29, 24
; CHECK-PWR8-NEXT: clrlwi r28, r28, 24
-; CHECK-PWR8-NEXT: sub r6, r8, r9
+; CHECK-PWR8-NEXT: sub r7, r0, r30
; CHECK-PWR8-NEXT: sub r9, r29, r28
+; CHECK-PWR8-NEXT: clrlwi r10, r10, 24
+; CHECK-PWR8-NEXT: clrlwi r12, r12, 24
+; CHECK-PWR8-NEXT: sub r6, r10, r12
; CHECK-PWR8-NEXT: clrlwi r27, r27, 24
-; CHECK-PWR8-NEXT: clrlwi r0, r0, 24
-; CHECK-PWR8-NEXT: clrlwi r30, r30, 24
-; CHECK-PWR8-NEXT: sub r8, r0, r30
-; CHECK-PWR8-NEXT: clrlwi r5, r5, 24
-; CHECK-PWR8-NEXT: srawi r10, r3, 31
+; CHECK-PWR8-NEXT: clrlwi r11, r11, 24
+; CHECK-PWR8-NEXT: srawi r0, r5, 31
+; CHECK-PWR8-NEXT: srawi r29, r7, 31
; CHECK-PWR8-NEXT: srawi r12, r4, 31
; CHECK-PWR8-NEXT: srawi r28, r9, 31
-; CHECK-PWR8-NEXT: srawi r0, r6, 31
-; CHECK-PWR8-NEXT: srawi r29, r8, 31
-; CHECK-PWR8-NEXT: srawi r30, r7, 31
-; CHECK-PWR8-NEXT: xor r3, r3, r10
-; CHECK-PWR8-NEXT: sub r10, r3, r10
-; CHECK-PWR8-NEXT: rldicl r3, r11, 16, 56
+; CHECK-PWR8-NEXT: srawi r30, r6, 31
+; CHECK-PWR8-NEXT: srawi r10, r3, 31
+; CHECK-PWR8-NEXT: xor r5, r5, r0
+; CHECK-PWR8-NEXT: xor r26, r7, r29
+; CHECK-PWR8-NEXT: sub r7, r5, r0
+; CHECK-PWR8-NEXT: rldicl r5, r8, 16, 56
+; CHECK-PWR8-NEXT: rldicl r8, r8, 8, 56
; CHECK-PWR8-NEXT: xor r4, r4, r12
-; CHECK-PWR8-NEXT: rldicl r11, r11, 8, 56
; CHECK-PWR8-NEXT: xor r25, r9, r28
; CHECK-PWR8-NEXT: sub r9, r4, r12
-; CHECK-PWR8-NEXT: sub r4, r25, r28
+; CHECK-PWR8-NEXT: sub r4, r26, r29
; CHECK-PWR8-NEXT: mtvsrd v1, r9
-; CHECK-PWR8-NEXT: clrlwi r3, r3, 24
-; CHECK-PWR8-NEXT: mtvsrd v7, r4
-; CHECK-PWR8-NEXT: sub r3, r27, r3
-; CHECK-PWR8-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR8-NEXT: xor r6, r6, r0
-; CHECK-PWR8-NEXT: sub r5, r5, r11
-; CHECK-PWR8-NEXT: xor r26, r8, r29
-; CHECK-PWR8-NEXT: sub r8, r6, r0
-; CHECK-PWR8-NEXT: mfvsrd r0, v3
-; CHECK-PWR8-NEXT: xor r7, r7, r30
-; CHECK-PWR8-NEXT: sub r7, r7, r30
-; CHECK-PWR8-NEXT: sub r6, r26, r29
-; CHECK-PWR8-NEXT: mtvsrd v6, r7
-; CHECK-PWR8-NEXT: clrldi r30, r0, 56
-; CHECK-PWR8-NEXT: rldicl r29, r0, 56, 56
-; CHECK-PWR8-NEXT: rldicl r28, r0, 48, 56
-; CHECK-PWR8-NEXT: rldicl r27, r0, 40, 56
-; CHECK-PWR8-NEXT: rldicl r26, r0, 32, 56
-; CHECK-PWR8-NEXT: rldicl r25, r0, 24, 56
-; CHECK-PWR8-NEXT: rldicl r24, r0, 16, 56
-; CHECK-PWR8-NEXT: rldicl r0, r0, 8, 56
-; CHECK-PWR8-NEXT: srawi r12, r3, 31
-; CHECK-PWR8-NEXT: srawi r11, r5, 31
+; CHECK-PWR8-NEXT: clrlwi r5, r5, 24
+; CHECK-PWR8-NEXT: sub r5, r27, r5
+; CHECK-PWR8-NEXT: clrlwi r8, r8, 24
+; CHECK-PWR8-NEXT: sub r8, r11, r8
+; CHECK-PWR8-NEXT: xor r6, r6, r30
+; CHECK-PWR8-NEXT: sub r6, r6, r30
+; CHECK-PWR8-NEXT: xor r3, r3, r10
+; CHECK-PWR8-NEXT: sub r10, r3, r10
+; CHECK-PWR8-NEXT: sub r3, r25, r28
+; CHECK-PWR8-NEXT: mtvsrd v6, r6
+; CHECK-PWR8-NEXT: mtvsrd v7, r3
+; CHECK-PWR8-NEXT: srawi r12, r5, 31
+; CHECK-PWR8-NEXT: srawi r11, r8, 31
+; CHECK-PWR8-NEXT: xor r5, r5, r12
+; CHECK-PWR8-NEXT: xor r8, r8, r11
+; CHECK-PWR8-NEXT: sub r5, r5, r12
+; CHECK-PWR8-NEXT: sub r8, r8, r11
+; CHECK-PWR8-NEXT: mfvsrd r11, v2
+; CHECK-PWR8-NEXT: mfvsrd r12, v3
+; CHECK-PWR8-NEXT: mtvsrd v8, r8
+; CHECK-PWR8-NEXT: clrldi r0, r11, 56
+; CHECK-PWR8-NEXT: clrldi r30, r12, 56
+; CHECK-PWR8-NEXT: rldicl r29, r12, 56, 56
+; CHECK-PWR8-NEXT: rldicl r28, r12, 48, 56
+; CHECK-PWR8-NEXT: rldicl r27, r12, 40, 56
+; CHECK-PWR8-NEXT: rldicl r26, r12, 32, 56
+; CHECK-PWR8-NEXT: rldicl r25, r12, 24, 56
+; CHECK-PWR8-NEXT: rldicl r24, r12, 16, 56
+; CHECK-PWR8-NEXT: rldicl r12, r12, 8, 56
+; CHECK-PWR8-NEXT: clrlwi r0, r0, 24
; CHECK-PWR8-NEXT: clrlwi r30, r30, 24
; CHECK-PWR8-NEXT: clrlwi r29, r29, 24
; CHECK-PWR8-NEXT: clrlwi r28, r28, 24
@@ -740,27 +748,19 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR8-NEXT: clrlwi r26, r26, 24
; CHECK-PWR8-NEXT: clrlwi r25, r25, 24
; CHECK-PWR8-NEXT: clrlwi r24, r24, 24
-; CHECK-PWR8-NEXT: clrlwi r0, r0, 24
-; CHECK-PWR8-NEXT: xor r3, r3, r12
-; CHECK-PWR8-NEXT: sub r3, r3, r12
-; CHECK-PWR8-NEXT: mfvsrd r12, v2
-; CHECK-PWR8-NEXT: xor r5, r5, r11
-; CHECK-PWR8-NEXT: sub r5, r5, r11
-; CHECK-PWR8-NEXT: mtvsrd v8, r5
-; CHECK-PWR8-NEXT: clrldi r11, r12, 56
-; CHECK-PWR8-NEXT: clrlwi r11, r11, 24
-; CHECK-PWR8-NEXT: sub r11, r11, r30
-; CHECK-PWR8-NEXT: srawi r30, r11, 31
-; CHECK-PWR8-NEXT: xor r11, r11, r30
-; CHECK-PWR8-NEXT: sub r11, r11, r30
-; CHECK-PWR8-NEXT: rldicl r30, r12, 56, 56
+; CHECK-PWR8-NEXT: clrlwi r12, r12, 24
+; CHECK-PWR8-NEXT: sub r0, r0, r30
+; CHECK-PWR8-NEXT: srawi r30, r0, 31
+; CHECK-PWR8-NEXT: xor r0, r0, r30
+; CHECK-PWR8-NEXT: sub r0, r0, r30
+; CHECK-PWR8-NEXT: rldicl r30, r11, 56, 56
; CHECK-PWR8-NEXT: clrlwi r30, r30, 24
-; CHECK-PWR8-NEXT: mtvsrd v2, r11
+; CHECK-PWR8-NEXT: mtvsrd v2, r0
; CHECK-PWR8-NEXT: sub r30, r30, r29
; CHECK-PWR8-NEXT: srawi r29, r30, 31
; CHECK-PWR8-NEXT: xor r30, r30, r29
; CHECK-PWR8-NEXT: sub r30, r30, r29
-; CHECK-PWR8-NEXT: rldicl r29, r12, 48, 56
+; CHECK-PWR8-NEXT: rldicl r29, r11, 48, 56
; CHECK-PWR8-NEXT: clrlwi r29, r29, 24
; CHECK-PWR8-NEXT: mtvsrd v3, r30
; CHECK-PWR8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
@@ -768,13 +768,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR8-NEXT: srawi r28, r29, 31
; CHECK-PWR8-NEXT: xor r29, r29, r28
; CHECK-PWR8-NEXT: sub r29, r29, r28
-; CHECK-PWR8-NEXT: rldicl r28, r12, 40, 56
+; CHECK-PWR8-NEXT: rldicl r28, r11, 40, 56
; CHECK-PWR8-NEXT: clrlwi r28, r28, 24
; CHECK-PWR8-NEXT: sub r28, r28, r27
; CHECK-PWR8-NEXT: srawi r27, r28, 31
; CHECK-PWR8-NEXT: xor r28, r28, r27
; CHECK-PWR8-NEXT: sub r28, r28, r27
-; CHECK-PWR8-NEXT: rldicl r27, r12, 32, 56
+; CHECK-PWR8-NEXT: rldicl r27, r11, 32, 56
; CHECK-PWR8-NEXT: clrlwi r27, r27, 24
; CHECK-PWR8-NEXT: mtvsrd v4, r28
; CHECK-PWR8-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
@@ -782,28 +782,28 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR8-NEXT: srawi r26, r27, 31
; CHECK-PWR8-NEXT: xor r27, r27, r26
; CHECK-PWR8-NEXT: sub r27, r27, r26
-; CHECK-PWR8-NEXT: rldicl r26, r12, 24, 56
+; CHECK-PWR8-NEXT: rldicl r26, r11, 24, 56
; CHECK-PWR8-NEXT: clrlwi r26, r26, 24
; CHECK-PWR8-NEXT: sub r26, r26, r25
; CHECK-PWR8-NEXT: srawi r25, r26, 31
; CHECK-PWR8-NEXT: xor r26, r26, r25
; CHECK-PWR8-NEXT: sub r26, r26, r25
-; CHECK-PWR8-NEXT: rldicl r25, r12, 16, 56
-; CHECK-PWR8-NEXT: rldicl r12, r12, 8, 56
+; CHECK-PWR8-NEXT: rldicl r25, r11, 16, 56
+; CHECK-PWR8-NEXT: rldicl r11, r11, 8, 56
; CHECK-PWR8-NEXT: clrlwi r25, r25, 24
-; CHECK-PWR8-NEXT: clrlwi r12, r12, 24
+; CHECK-PWR8-NEXT: clrlwi r11, r11, 24
; CHECK-PWR8-NEXT: mtvsrd v5, r26
; CHECK-PWR8-NEXT: ld r26, -48(r1) # 8-byte Folded Reload
; CHECK-PWR8-NEXT: sub r25, r25, r24
-; CHECK-PWR8-NEXT: sub r12, r12, r0
+; CHECK-PWR8-NEXT: sub r11, r11, r12
; CHECK-PWR8-NEXT: srawi r24, r25, 31
-; CHECK-PWR8-NEXT: srawi r0, r12, 31
+; CHECK-PWR8-NEXT: srawi r12, r11, 31
; CHECK-PWR8-NEXT: xor r25, r25, r24
-; CHECK-PWR8-NEXT: xor r12, r12, r0
+; CHECK-PWR8-NEXT: xor r11, r11, r12
; CHECK-PWR8-NEXT: sub r25, r25, r24
-; CHECK-PWR8-NEXT: sub r12, r12, r0
+; CHECK-PWR8-NEXT: sub r11, r11, r12
; CHECK-PWR8-NEXT: ld r24, -64(r1) # 8-byte Folded Reload
-; CHECK-PWR8-NEXT: mtvsrd v0, r12
+; CHECK-PWR8-NEXT: mtvsrd v0, r11
; CHECK-PWR8-NEXT: vmrghb v2, v3, v2
; CHECK-PWR8-NEXT: mtvsrd v3, r29
; CHECK-PWR8-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
@@ -819,12 +819,12 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR8-NEXT: vmrglh v3, v5, v4
; CHECK-PWR8-NEXT: xxmrglw vs0, v3, v2
; CHECK-PWR8-NEXT: vmrghb v0, v1, v0
-; CHECK-PWR8-NEXT: mtvsrd v1, r8
+; CHECK-PWR8-NEXT: mtvsrd v1, r7
; CHECK-PWR8-NEXT: vmrghb v1, v6, v1
-; CHECK-PWR8-NEXT: mtvsrd v6, r6
+; CHECK-PWR8-NEXT: mtvsrd v6, r4
; CHECK-PWR8-NEXT: vmrglh v4, v1, v0
; CHECK-PWR8-NEXT: vmrghb v6, v7, v6
-; CHECK-PWR8-NEXT: mtvsrd v7, r3
+; CHECK-PWR8-NEXT: mtvsrd v7, r5
; CHECK-PWR8-NEXT: vmrghb v7, v8, v7
; CHECK-PWR8-NEXT: vmrglh v5, v7, v6
; CHECK-PWR8-NEXT: xxmrglw vs1, v5, v4
@@ -854,85 +854,85 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
; CHECK-PWR7-NEXT: addi r3, r1, 320
; CHECK-PWR7-NEXT: lbz r4, 304(r1)
; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT: lbz r3, 320(r1)
; CHECK-PWR7-NEXT: lbz r5, 305(r1)
; CHECK-PWR7-NEXT: lbz r6, 321(r1)
; CHECK-PWR7-NEXT: lbz r7, 306(r1)
; CHECK-PWR7-NEXT: lbz r8, 322(r1)
-; CHECK-PWR7-NEXT: lbz r0, 309(r1)
-; CHECK-PWR7-NEXT: lbz r30, 325(r1)
; CHECK-PWR7-NEXT: lbz r9, 307(r1)
; CHECK-PWR7-NEXT: lbz r10, 323(r1)
+; CHECK-PWR7-NEXT: lbz r0, 309(r1)
+; CHECK-PWR7-NEXT: lbz r30, 325(r1)
; CHECK-PWR7-NEXT: lbz r29, 310(r1)
; CHECK-PWR7-NEXT: lbz r28, 326(r1)
; CHECK-PWR7-NEXT: lbz r11, 308(r1)
; CHECK-PWR7-NEXT: lbz r12, 324(r1)
; CHECK-PWR7-NEXT: lbz r27, 311(r1)
; CHECK-PWR7-NEXT: lbz r26, 327(r1)
-; CHECK-PWR7-NEXT: sub r3, r4, r3
-; CHECK-PWR7-NEXT: sub r4, r5, r6
-; CHECK-PWR7-NEXT: sub r5, r7, r8
-; CHECK-PWR7-NEXT: sub r8, r0, r30
-; CHECK-PWR7-NEXT: sub r6, r9, r10
-; CHECK-PWR7-NEXT: sub r9, r29, r28
-; CHECK-PWR7-NEXT: srawi r0, r4, 31
-; CHECK-PWR7-NEXT: srawi r30, r5, 31
-; CHECK-PWR7-NEXT: srawi r29, r6, 31
-; CHECK-PWR7-NEXT: sub r7, r11, r12
-; CHECK-PWR7-NEXT: srawi r28, r7, 31
-; CHECK-PWR7-NEXT: sub r10, r27, r26
-; CHECK-PWR7-NEXT: srawi r27, r8, 31
-; CHECK-PWR7-NEXT: xor r4, r4, r0
-; CHECK-PWR7-NEXT: xor r5, r5, r30
-; CHECK-PWR7-NEXT: xor r6, r6, r29
-; CHECK-PWR7-NEXT: xor r7, r7, r28
-; CHECK-PWR7-NEXT: xor r8, r8, r27
-; CHECK-PWR7-NEXT: srawi r26, r9, 31
-; CHECK-PWR7-NEXT: sub r4, r4, r0
-; CHECK-PWR7-NEXT: sub r5, r5, r30
+; CHECK-PWR7-NEXT: lbz r25, 312(r1)
+; CHECK-PWR7-NEXT: sub r5, r5, r6
+; CHECK-PWR7-NEXT: sub r6, r7, r8
+; CHECK-PWR7-NEXT: sub r7, r9, r10
+; CHECK-PWR7-NEXT: sub r9, r0, r30
+; CHECK-PWR7-NEXT: sub r10, r29, r28
+; CHECK-PWR7-NEXT: sub r8, r11, r12
+; CHECK-PWR7-NEXT: srawi r0, r5, 31
+; CHECK-PWR7-NEXT: srawi r30, r6, 31
+; CHECK-PWR7-NEXT: srawi r29, r7, 31
+; CHECK-PWR7-NEXT: srawi r28, r8, 31
+; CHECK-PWR7-NEXT: sub r11, r27, r26
+; CHECK-PWR7-NEXT: srawi r27, r9, 31
+; CHECK-PWR7-NEXT: lbz r24, 328(r1)
+; CHECK-PWR7-NEXT: xor r5, r5, r0
+; CHECK-PWR7-NEXT: xor r6, r6, r30
+; CHECK-PWR7-NEXT: xor r7, r7, r29
+; CHECK-PWR7-NEXT: xor r8, r8, r28
+; CHECK-PWR7-NEXT: xor r9, r9, r27
+; CHECK-PWR7-NEXT: srawi r26, r10, 31
+; CHECK-PWR7-NEXT: sub r5, r5, r0
+; CHECK-PWR7-NEXT: sub r6, r6, r30
; CHECK-PWR7-NEXT: lbz r0, 313(r1)
; CHECK-PWR7-NEXT: lbz r30, 329(r1)
-; CHECK-PWR7-NEXT: sub r6, r6, r29
+; CHECK-PWR7-NEXT: sub r7, r7, r29
; CHECK-PWR7-NEXT: lbz r29, 330(r1)
-; CHECK-PWR7-NEXT: sub r7, r7, r28
+; CHECK-PWR7-NEXT: sub r8, r8, r28
; CHECK-PWR7-NEXT: lbz r28, 331(r1)
-; CHECK-PWR7-NEXT: sub r8, r8, r27
+; CHECK-PWR7-NEXT: sub r9, r9, r27
; CHECK-PWR7-NEXT: lbz r27, 332(r1)
-; CHECK-PWR7-NEXT: xor r9, r9, r26
-; CHECK-PWR7-NEXT: sub r9, r9, r26
+; CHECK-PWR7-NEXT: xor r10, r10, r26
+; CHECK-PWR7-NEXT: sub r10, r10, r26
; CHECK-PWR7-NEXT: lbz r26, 333(r1)
-; CHECK-PWR7-NEXT: lbz r25, 312(r1)
-; CHECK-PWR7-NEXT: lbz r24, 328(r1)
-; CHECK-PWR7-NEXT: sub r11, r25, r24
+; CHECK-PWR7-NEXT: sub r12, r25, r24
+; CHECK-PWR7-NEXT: srawi r25, r11, 31
+; CHECK-PWR7-NEXT: lbz r3, 320(r1)
; CHECK-PWR7-NEXT: sub r0, r0, r30
-; CHECK-PWR7-NEXT: srawi r25, r10, 31
-; CHECK-PWR7-NEXT: xor r10, r10, r25
-; CHECK-PWR7-NEXT: sub r10, r10, r25
+; CHECK-PWR7-NEXT: xor r11, r11, r25
+; CHECK-PWR7-NEXT: sub r11, r11, r25
; CHECK-PWR7-NEXT: lbz r25, 334(r1)
+; CHECK-PWR7-NEXT: sub r4, r4, r3
; CHECK-PWR7-NEXT: srawi r30, r0, 31
-; CHECK-PWR7-NEXT: srawi r24, r11, 31
-; CHECK-PWR7-NEXT: xor r11, r11, r24
-; CHECK-PWR7-NEXT: sub r11, r11, r24
+; CHECK-PWR7-NEXT: srawi r24, r12, 31
+; CHECK-PWR7-NEXT: xor r12, r12, r24
+; CHECK-PWR7-NEXT: sub r12, r12, r24
; CHECK-PWR7-NEXT: lbz r24, 335(r1)
-; CHECK-PWR7-NEXT: srawi r12, r3, 31
-; CHECK-PWR7-NEXT: xor r3, r3, r12
+; CHECK-PWR7-NEXT: srawi r3, r4, 31
+; CHECK-PWR7-NEXT: xor r4, r4, r3
; CHECK-PWR7-NEXT: xor r0, r0, r30
-; CHECK-PWR7-NEXT: sub r3, r3, r12
+; CHECK-PWR7-NEXT: sub r3, r4, r3
; CHECK-PWR7-NEXT: stb r3, 48(r1)
; CHECK-PWR7-NEXT: addi r3, r1, 288
-; CHECK-PWR7-NEXT: stb r11, 176(r1)
+; CHECK-PWR7-NEXT: stb r12, 176(r1)
; CHECK-PWR7-NEXT: sub r0, r0, r30
; CHECK-PWR7-NEXT: lbz r30, 314(r1)
-; CHECK-PWR7-NEXT: stb r10, 160(r1)
+; CHECK-PWR7-NEXT: stb r11, 160(r1)
; CHECK-PWR7-NEXT: sub r30, r30, r29
; CHECK-PWR7-NEXT: stb r0, 192(r1)
-; CHECK-PWR7-NEXT: stb r9, 144(r1)
-; CHECK-PWR7-NEXT: stb r8, 128(r1)
-; CHECK-PWR7-NEXT: stb r7, 112(r1)
-; CHECK-PWR7-NEXT: stb r6, 96(r1)
-; CHECK-PWR7-NEXT: stb r5, 80(r1)
+; CHECK-PWR7-NEXT: stb r10, 144(r1)
+; CHECK-PWR7-NEXT: stb r9, 128(r1)
+; CHECK-PWR7-NEXT: stb r8, 112(r1)
+; CHECK-PWR7-NEXT: stb r7, 96(r1)
+; CHECK-PWR7-NEXT: stb r6, 80(r1)
; CHECK-PWR7-NEXT: srawi r29, r30, 31
-; CHECK-PWR7-NEXT: stb r4, 64(r1)
+; CHECK-PWR7-NEXT: stb r5, 64(r1)
; CHECK-PWR7-NEXT: xor r30, r30, r29
; CHECK-PWR7-NEXT: sub r30, r30, r29
; CHECK-PWR7-NEXT: lbz r29, 315(r1)
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
index 9fea8e5f8ab47..f699ea54192d8 100644
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -756,7 +756,7 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr {
; CHECK-NEXT: xxswapd 1, 32
; CHECK-NEXT: xxswapd 6, 42
; CHECK-NEXT: mffprd 5, 1
-; CHECK-NEXT: cmpld 5, 3
+; CHECK-NEXT: cmpld 6, 5, 3
; CHECK-NEXT: mffprd 7, 6
; CHECK-NEXT: xxswapd 3, 33
; CHECK-NEXT: xxswapd 7, 43
@@ -765,9 +765,9 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr {
; CHECK-NEXT: mffprd 6, 5
; CHECK-NEXT: mffprd 7, 7
; CHECK-NEXT: mfvsrd 5, 36
-; CHECK-NEXT: cmpld 1, 3, 4
+; CHECK-NEXT: cmpld 3, 4
; CHECK-NEXT: mfvsrd 3, 34
-; CHECK-NEXT: cmpld 6, 7, 6
+; CHECK-NEXT: cmpld 1, 7, 6
; CHECK-NEXT: mfvsrd 7, 32
; CHECK-NEXT: mfvsrd 4, 35
; CHECK-NEXT: mfvsrd 6, 37
@@ -775,34 +775,34 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr {
; CHECK-NEXT: cmpd 2, 7, 3
; CHECK-NEXT: mfvsrd 3, 33
; CHECK-NEXT: crandc 21, 8, 30
-; CHECK-NEXT: crand 22, 30, 0
-; CHECK-NEXT: cmpld 3, 4
+; CHECK-NEXT: crand 22, 30, 24
+; CHECK-NEXT: cmpld 6, 3, 4
; CHECK-NEXT: cmpd 7, 3, 4
; CHECK-NEXT: mfvsrd 4, 42
; CHECK-NEXT: sradi 3, 3, 63
; CHECK-NEXT: mtocrf 32, 12
; CHECK-NEXT: crnor 21, 22, 21
-; CHECK-NEXT: crandc 23, 28, 2
-; CHECK-NEXT: crand 25, 2, 4
+; CHECK-NEXT: crandc 23, 28, 26
+; CHECK-NEXT: crand 24, 26, 0
; CHECK-NEXT: cmpld 4, 5
-; CHECK-NEXT: cmpd 1, 4, 5
+; CHECK-NEXT: cmpd 7, 4, 5
; CHECK-NEXT: mfvsrd 5, 43
-; CHECK-NEXT: crnor 22, 25, 23
+; CHECK-NEXT: crnor 22, 24, 23
; CHECK-NEXT: mtfprd 5, 3
; CHECK-NEXT: sradi 4, 4, 63
; CHECK-NEXT: mtfprd 6, 4
-; CHECK-NEXT: crandc 26, 4, 2
+; CHECK-NEXT: crandc 25, 28, 2
; CHECK-NEXT: crand 20, 2, 20
; CHECK-NEXT: cmpld 5, 6
-; CHECK-NEXT: cmpd 1, 5, 6
+; CHECK-NEXT: cmpd 7, 5, 6
; CHECK-NEXT: mfvsrd 6, 38
; CHECK-NEXT: sradi 5, 5, 63
-; CHECK-NEXT: crnor 20, 20, 26
+; CHECK-NEXT: crnor 20, 20, 25
; CHECK-NEXT: mtfprd 7, 5
; CHECK-NEXT: sradi 6, 6, 63
-; CHECK-NEXT: crandc 27, 4, 2
-; CHECK-NEXT: crand 24, 2, 24
-; CHECK-NEXT: crnor 23, 24, 27
+; CHECK-NEXT: crandc 26, 28, 2
+; CHECK-NEXT: crand 27, 2, 4
+; CHECK-NEXT: crnor 23, 27, 26
; CHECK-NEXT: mtfprd 0, 6
; CHECK-NEXT: mfvsrd 6, 39
; CHECK-NEXT: sradi 6, 6, 63
diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
index a263b56ce70ce..df55b92997765 100644
--- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
@@ -647,27 +647,27 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
;
; P8BE-LABEL: combine_srem_sdiv:
; P8BE: # %bb.0:
-; P8BE-NEXT: mfvsrd r3, v2
-; P8BE-NEXT: lis r4, -21386
-; P8BE-NEXT: ori r4, r4, 37253
-; P8BE-NEXT: clrldi r5, r3, 48
-; P8BE-NEXT: rldicl r6, r3, 48, 48
-; P8BE-NEXT: rldicl r7, r3, 32, 48
-; P8BE-NEXT: rldicl r3, r3, 16, 48
-; P8BE-NEXT: extsh r8, r5
+; P8BE-NEXT: mfvsrd r4, v2
+; P8BE-NEXT: lis r5, -21386
+; P8BE-NEXT: ori r5, r5, 37253
+; P8BE-NEXT: clrldi r3, r4, 48
+; P8BE-NEXT: rldicl r6, r4, 48, 48
+; P8BE-NEXT: rldicl r7, r4, 32, 48
+; P8BE-NEXT: rldicl r4, r4, 16, 48
+; P8BE-NEXT: extsh r8, r3
; P8BE-NEXT: extsh r9, r6
; P8BE-NEXT: extsh r10, r7
-; P8BE-NEXT: extsh r3, r3
-; P8BE-NEXT: mulhw r11, r8, r4
+; P8BE-NEXT: extsh r4, r4
+; P8BE-NEXT: mulhw r11, r8, r5
; P8BE-NEXT: add r8, r11, r8
-; P8BE-NEXT: mulhw r11, r9, r4
+; P8BE-NEXT: mulhw r11, r9, r5
; P8BE-NEXT: add r9, r11, r9
-; P8BE-NEXT: mulhw r11, r10, r4
-; P8BE-NEXT: mulhw r4, r3, r4
+; P8BE-NEXT: mulhw r11, r10, r5
+; P8BE-NEXT: mulhw r5, r4, r5
; P8BE-NEXT: add r10, r11, r10
; P8BE-NEXT: srwi r11, r8, 31
; P8BE-NEXT: srawi r8, r8, 6
-; P8BE-NEXT: add r4, r4, r3
+; P8BE-NEXT: add r5, r5, r4
; P8BE-NEXT: add r8, r8, r11
; P8BE-NEXT: srwi r11, r9, 31
; P8BE-NEXT: srawi r9, r9, 6
@@ -676,30 +676,30 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P8BE-NEXT: srawi r10, r10, 6
; P8BE-NEXT: mtvsrwz v3, r8
; P8BE-NEXT: add r10, r10, r11
-; P8BE-NEXT: srwi r11, r4, 31
-; P8BE-NEXT: srawi r4, r4, 6
+; P8BE-NEXT: srwi r11, r5, 31
+; P8BE-NEXT: srawi r5, r5, 6
; P8BE-NEXT: mtvsrwz v4, r9
-; P8BE-NEXT: add r4, r4, r11
+; P8BE-NEXT: add r5, r5, r11
; P8BE-NEXT: mulli r11, r8, 95
-; P8BE-NEXT: sub r5, r5, r11
+; P8BE-NEXT: sub r3, r3, r11
; P8BE-NEXT: mulli r11, r9, 95
-; P8BE-NEXT: mtvsrwz v5, r5
+; P8BE-NEXT: mtvsrwz v5, r3
; P8BE-NEXT: sub r6, r6, r11
; P8BE-NEXT: mulli r11, r10, 95
; P8BE-NEXT: mtvsrwz v0, r6
; P8BE-NEXT: sub r7, r7, r11
-; P8BE-NEXT: mulli r11, r4, 95
+; P8BE-NEXT: mulli r11, r5, 95
; P8BE-NEXT: mtvsrwz v1, r7
-; P8BE-NEXT: sub r3, r3, r11
+; P8BE-NEXT: sub r4, r4, r11
; P8BE-NEXT: addis r11, r2, .LCPI2_0@toc@ha
; P8BE-NEXT: addi r11, r11, .LCPI2_0@toc@l
; P8BE-NEXT: lxvw4x v2, 0, r11
; P8BE-NEXT: vperm v5, v0, v5, v2
-; P8BE-NEXT: mtvsrwz v0, r3
+; P8BE-NEXT: mtvsrwz v0, r4
; P8BE-NEXT: vperm v3, v4, v3, v2
; P8BE-NEXT: mtvsrwz v4, r10
; P8BE-NEXT: vperm v0, v0, v1, v2
-; P8BE-NEXT: mtvsrwz v1, r4
+; P8BE-NEXT: mtvsrwz v1, r5
; P8BE-NEXT: vperm v2, v1, v4, v2
; P8BE-NEXT: xxmrghw v4, v0, v5
; P8BE-NEXT: xxmrghw v2, v2, v3
diff --git a/llvm/test/CodeGen/PowerPC/sub-of-not.ll b/llvm/test/CodeGen/PowerPC/sub-of-not.ll
index d2b55aaf7ac83..9cd2ec5510886 100644
--- a/llvm/test/CodeGen/PowerPC/sub-of-not.ll
+++ b/llvm/test/CodeGen/PowerPC/sub-of-not.ll
@@ -65,88 +65,89 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; PPC32: # %bb.0:
; PPC32-NEXT: stwu 1, -64(1)
; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill
-; PPC32-NEXT: lbz 21, 123(1)
; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill
-; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill
-; PPC32-NEXT: add 7, 21, 7
-; PPC32-NEXT: lbz 23, 115(1)
+; PPC32-NEXT: lbz 4, 115(1)
; PPC32-NEXT: lbz 22, 119(1)
-; PPC32-NEXT: lbz 21, 135(1)
-; PPC32-NEXT: add 5, 23, 5
-; PPC32-NEXT: lbz 23, 127(1)
-; PPC32-NEXT: add 6, 22, 6
+; PPC32-NEXT: lbz 21, 123(1)
+; PPC32-NEXT: add 4, 4, 5
+; PPC32-NEXT: add 5, 22, 6
; PPC32-NEXT: lbz 22, 131(1)
-; PPC32-NEXT: add 10, 21, 10
-; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill
-; PPC32-NEXT: add 8, 23, 8
-; PPC32-NEXT: lbz 26, 83(1)
+; PPC32-NEXT: add 6, 21, 7
+; PPC32-NEXT: lbz 21, 135(1)
+; PPC32-NEXT: addi 6, 6, 1
+; PPC32-NEXT: stw 20, 16(1) # 4-byte Folded Spill
; PPC32-NEXT: add 9, 22, 9
+; PPC32-NEXT: lbz 20, 127(1)
+; PPC32-NEXT: add 10, 21, 10
+; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill
+; PPC32-NEXT: addi 5, 5, 1
+; PPC32-NEXT: lbz 25, 83(1)
+; PPC32-NEXT: add 7, 20, 8
; PPC32-NEXT: lbz 21, 147(1)
+; PPC32-NEXT: addi 7, 7, 1
; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill
-; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill
-; PPC32-NEXT: add 26, 21, 26
-; PPC32-NEXT: lbz 25, 79(1)
-; PPC32-NEXT: lbz 24, 75(1)
-; PPC32-NEXT: lbz 23, 139(1)
+; PPC32-NEXT: addi 4, 4, 1
+; PPC32-NEXT: lbz 24, 79(1)
+; PPC32-NEXT: add 25, 21, 25
; PPC32-NEXT: lbz 22, 143(1)
-; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; PPC32-NEXT: add 24, 23, 24
-; PPC32-NEXT: lbz 29, 95(1)
-; PPC32-NEXT: add 25, 22, 25
+; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill
+; PPC32-NEXT: lbz 23, 75(1)
+; PPC32-NEXT: add 24, 22, 24
+; PPC32-NEXT: lbz 8, 139(1)
+; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill
+; PPC32-NEXT: lbz 28, 95(1)
+; PPC32-NEXT: add 8, 8, 23
; PPC32-NEXT: lbz 21, 159(1)
+; PPC32-NEXT: addi 8, 8, 1
; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill
-; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill
-; PPC32-NEXT: add 29, 21, 29
-; PPC32-NEXT: lbz 28, 91(1)
-; PPC32-NEXT: lbz 27, 87(1)
-; PPC32-NEXT: lbz 23, 151(1)
+; PPC32-NEXT: lbz 27, 91(1)
+; PPC32-NEXT: add 28, 21, 28
; PPC32-NEXT: lbz 22, 155(1)
-; PPC32-NEXT: lbz 4, 111(1)
-; PPC32-NEXT: add 27, 23, 27
+; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill
+; PPC32-NEXT: lbz 26, 87(1)
+; PPC32-NEXT: add 27, 22, 27
+; PPC32-NEXT: lbz 23, 151(1)
+; PPC32-NEXT: lbz 11, 111(1)
; PPC32-NEXT: lbz 21, 175(1)
-; PPC32-NEXT: add 28, 22, 28
-; PPC32-NEXT: lbz 11, 107(1)
-; PPC32-NEXT: lbz 12, 171(1)
-; PPC32-NEXT: add 4, 21, 4
+; PPC32-NEXT: add 26, 23, 26
+; PPC32-NEXT: lbz 12, 107(1)
+; PPC32-NEXT: lbz 0, 171(1)
+; PPC32-NEXT: add 11, 21, 11
; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; PPC32-NEXT: addi 4, 4, 1
-; PPC32-NEXT: lbz 0, 103(1)
-; PPC32-NEXT: add 11, 12, 11
-; PPC32-NEXT: lbz 30, 99(1)
-; PPC32-NEXT: lbz 23, 163(1)
+; PPC32-NEXT: addi 11, 11, 1
+; PPC32-NEXT: lbz 30, 103(1)
+; PPC32-NEXT: add 12, 0, 12
; PPC32-NEXT: lbz 22, 167(1)
-; PPC32-NEXT: add 30, 23, 30
-; PPC32-NEXT: stb 4, 15(3)
-; PPC32-NEXT: add 23, 22, 0
-; PPC32-NEXT: addi 4, 11, 1
-; PPC32-NEXT: stb 4, 14(3)
-; PPC32-NEXT: addi 4, 23, 1
-; PPC32-NEXT: stb 4, 13(3)
-; PPC32-NEXT: addi 4, 30, 1
-; PPC32-NEXT: stb 4, 12(3)
-; PPC32-NEXT: addi 4, 29, 1
-; PPC32-NEXT: stb 4, 11(3)
-; PPC32-NEXT: addi 4, 28, 1
-; PPC32-NEXT: stb 4, 10(3)
-; PPC32-NEXT: addi 4, 27, 1
-; PPC32-NEXT: stb 4, 9(3)
-; PPC32-NEXT: addi 4, 26, 1
-; PPC32-NEXT: stb 4, 8(3)
-; PPC32-NEXT: addi 4, 25, 1
-; PPC32-NEXT: stb 4, 7(3)
-; PPC32-NEXT: addi 4, 24, 1
-; PPC32-NEXT: stb 4, 6(3)
-; PPC32-NEXT: addi 4, 10, 1
-; PPC32-NEXT: stb 4, 5(3)
-; PPC32-NEXT: addi 4, 9, 1
-; PPC32-NEXT: stb 4, 4(3)
-; PPC32-NEXT: addi 4, 8, 1
-; PPC32-NEXT: stb 4, 3(3)
-; PPC32-NEXT: addi 4, 7, 1
-; PPC32-NEXT: stb 4, 2(3)
-; PPC32-NEXT: addi 4, 6, 1
-; PPC32-NEXT: stb 4, 1(3)
-; PPC32-NEXT: addi 4, 5, 1
+; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill
+; PPC32-NEXT: lbz 29, 99(1)
+; PPC32-NEXT: add 30, 22, 30
+; PPC32-NEXT: lbz 23, 163(1)
+; PPC32-NEXT: stb 11, 15(3)
+; PPC32-NEXT: addi 11, 12, 1
+; PPC32-NEXT: add 29, 23, 29
+; PPC32-NEXT: stb 11, 14(3)
+; PPC32-NEXT: addi 11, 30, 1
+; PPC32-NEXT: stb 11, 13(3)
+; PPC32-NEXT: addi 11, 29, 1
+; PPC32-NEXT: stb 11, 12(3)
+; PPC32-NEXT: addi 11, 28, 1
+; PPC32-NEXT: stb 11, 11(3)
+; PPC32-NEXT: addi 11, 27, 1
+; PPC32-NEXT: stb 11, 10(3)
+; PPC32-NEXT: addi 11, 26, 1
+; PPC32-NEXT: stb 11, 9(3)
+; PPC32-NEXT: addi 11, 25, 1
+; PPC32-NEXT: stb 8, 6(3)
+; PPC32-NEXT: addi 8, 10, 1
+; PPC32-NEXT: stb 11, 8(3)
+; PPC32-NEXT: addi 11, 24, 1
+; PPC32-NEXT: stb 8, 5(3)
+; PPC32-NEXT: addi 8, 9, 1
+; PPC32-NEXT: stb 11, 7(3)
+; PPC32-NEXT: stb 8, 4(3)
+; PPC32-NEXT: stb 7, 3(3)
+; PPC32-NEXT: stb 6, 2(3)
+; PPC32-NEXT: stb 5, 1(3)
; PPC32-NEXT: stb 4, 0(3)
; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
@@ -158,6 +159,7 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload
; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload
; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload
+; PPC32-NEXT: lwz 20, 16(1) # 4-byte Folded Reload
; PPC32-NEXT: addi 1, 1, 64
; PPC32-NEXT: blr
;
diff --git a/llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll b/llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll
index abda0c897a7cf..43aeb3655c5de 100644
--- a/llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll
+++ b/llvm/test/CodeGen/PowerPC/tocSaveInPrologue.ll
@@ -15,13 +15,13 @@ define dso_local void @test(ptr nocapture %fp, i32 signext %Arg, i32 signext %Le
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: stdu r1, -64(r1)
; CHECK-NEXT: cmpwi r5, 1
-; CHECK-NEXT: mr r30, r4
+; CHECK-NEXT: mr r29, r4
; CHECK-NEXT: std r2, 24(r1)
; CHECK-NEXT: std r0, 80(r1)
-; CHECK-NEXT: mr r29, r3
+; CHECK-NEXT: mr r30, r3
; CHECK-NEXT: bc 12, lt, .LBB0_4
; CHECK-NEXT: # %bb.1: # %entry
-; CHECK-NEXT: cmpwi r30, 11
+; CHECK-NEXT: cmpwi r29, 11
; CHECK-NEXT: bc 12, lt, .LBB0_4
; CHECK-NEXT: # %bb.2: # %for.body.us.preheader
; CHECK-NEXT: addi r3, r5, -1
@@ -30,18 +30,18 @@ define dso_local void @test(ptr nocapture %fp, i32 signext %Arg, i32 signext %Le
; CHECK-NEXT: .p2align 5
; CHECK-NEXT: .LBB0_3: # %for.body.us
; CHECK-NEXT: #
-; CHECK-NEXT: mtctr r29
-; CHECK-NEXT: mr r3, r30
-; CHECK-NEXT: mr r12, r29
+; CHECK-NEXT: mtctr r30
+; CHECK-NEXT: mr r3, r29
+; CHECK-NEXT: mr r12, r30
; CHECK-NEXT: bctrl
; CHECK-NEXT: ld 2, 24(r1)
; CHECK-NEXT: addi r28, r28, -1
; CHECK-NEXT: cmpldi r28, 0
; CHECK-NEXT: bc 12, gt, .LBB0_3
; CHECK-NEXT: .LBB0_4: # %for.cond.cleanup
-; CHECK-NEXT: mtctr r29
-; CHECK-NEXT: mr r3, r30
-; CHECK-NEXT: mr r12, r29
+; CHECK-NEXT: mtctr r30
+; CHECK-NEXT: mr r3, r29
+; CHECK-NEXT: mr r12, r30
; CHECK-NEXT: bctrl
; CHECK-NEXT: ld 2, 24(r1)
; CHECK-NEXT: addi r1, r1, 64
diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll
index e2bbb04cf532a..48098e3a277c1 100644
--- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll
@@ -36,68 +36,68 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill
; PPC32-NEXT: mulhwu. 26, 7, 6
; PPC32-NEXT: mcrf 1, 0
+; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill
; PPC32-NEXT: mfcr 12
; PPC32-NEXT: cmpwi 7, 5, 0
; PPC32-NEXT: cmpwi 2, 7, 0
-; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill
; PPC32-NEXT: mulhwu. 26, 5, 8
; PPC32-NEXT: mcrf 5, 0
-; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill
+; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill
; PPC32-NEXT: crnor 20, 30, 10
-; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill
+; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill
; PPC32-NEXT: cmpwi 7, 9, 0
; PPC32-NEXT: mulhwu. 26, 3, 10
; PPC32-NEXT: mcrf 6, 0
+; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill
; PPC32-NEXT: cmpwi 2, 3, 0
; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill
; PPC32-NEXT: crnor 21, 30, 10
-; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill
; PPC32-NEXT: mulhwu. 26, 9, 4
-; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill
+; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill
; PPC32-NEXT: crorc 20, 20, 6
-; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill
+; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill
; PPC32-NEXT: crorc 21, 21, 26
-; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; PPC32-NEXT: mulhwu 0, 6, 10
+; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill
+; PPC32-NEXT: mulhwu 30, 6, 10
; PPC32-NEXT: stw 12, 20(1)
; PPC32-NEXT: crorc 20, 20, 22
; PPC32-NEXT: crorc 21, 21, 2
; PPC32-NEXT: li 11, 0
; PPC32-NEXT: mullw 26, 5, 10
-; PPC32-NEXT: addc 0, 26, 0
+; PPC32-NEXT: addc 30, 26, 30
; PPC32-NEXT: mulhwu 29, 5, 10
; PPC32-NEXT: addze 29, 29
; PPC32-NEXT: mullw 23, 5, 8
; PPC32-NEXT: mullw 22, 7, 6
-; PPC32-NEXT: mulhwu 30, 6, 9
+; PPC32-NEXT: mulhwu 0, 6, 9
; PPC32-NEXT: mulhwu 12, 5, 9
-; PPC32-NEXT: mulhwu 28, 8, 6
+; PPC32-NEXT: mulhwu 27, 8, 6
; PPC32-NEXT: mullw 25, 6, 9
; PPC32-NEXT: mullw 24, 5, 9
; PPC32-NEXT: mullw 5, 9, 4
; PPC32-NEXT: add 9, 22, 23
-; PPC32-NEXT: add 9, 28, 9
-; PPC32-NEXT: cmplw 1, 9, 28
+; PPC32-NEXT: add 9, 27, 9
+; PPC32-NEXT: cmplw 1, 9, 27
; PPC32-NEXT: cror 20, 20, 4
; PPC32-NEXT: mullw 23, 3, 10
; PPC32-NEXT: add 26, 23, 5
-; PPC32-NEXT: addc 5, 25, 0
-; PPC32-NEXT: addze 30, 30
+; PPC32-NEXT: addc 5, 25, 30
+; PPC32-NEXT: addze 0, 0
; PPC32-NEXT: or. 3, 4, 3
-; PPC32-NEXT: mulhwu 27, 4, 10
+; PPC32-NEXT: mulhwu 28, 4, 10
; PPC32-NEXT: mcrf 1, 0
-; PPC32-NEXT: addc 3, 29, 30
-; PPC32-NEXT: add 26, 27, 26
-; PPC32-NEXT: cmplw 6, 26, 27
+; PPC32-NEXT: addc 3, 29, 0
+; PPC32-NEXT: add 26, 28, 26
+; PPC32-NEXT: cmplw 6, 26, 28
; PPC32-NEXT: cror 21, 21, 24
-; PPC32-NEXT: mullw 0, 4, 10
+; PPC32-NEXT: mullw 30, 4, 10
; PPC32-NEXT: or. 4, 8, 7
; PPC32-NEXT: addze 4, 11
; PPC32-NEXT: addc 7, 24, 3
; PPC32-NEXT: crnor 22, 2, 6
-; PPC32-NEXT: mullw 28, 8, 6
+; PPC32-NEXT: mullw 27, 8, 6
; PPC32-NEXT: adde 8, 12, 4
-; PPC32-NEXT: addc 3, 0, 28
+; PPC32-NEXT: addc 3, 30, 27
; PPC32-NEXT: adde 9, 26, 9
; PPC32-NEXT: addc 4, 7, 3
; PPC32-NEXT: adde 3, 8, 9
diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
index 98314a02c23fe..a2ad2946cc8ec 100644
--- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
@@ -897,31 +897,31 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; P8LE-NEXT: mfvsrd r6, v2
; P8LE-NEXT: mfvsrd r8, v3
; P8LE-NEXT: ori r3, r3, 51289
-; P8LE-NEXT: mffprd r4, f0
; P8LE-NEXT: ori r5, r5, 42889
-; P8LE-NEXT: rldic r3, r3, 36, 1
+; P8LE-NEXT: rldic r4, r3, 36, 1
+; P8LE-NEXT: mffprd r3, f0
; P8LE-NEXT: rldic r5, r5, 35, 1
; P8LE-NEXT: rldicl r7, r6, 63, 1
-; P8LE-NEXT: oris r3, r3, 45590
+; P8LE-NEXT: oris r4, r4, 45590
; P8LE-NEXT: oris r5, r5, 1603
-; P8LE-NEXT: ori r3, r3, 17097
+; P8LE-NEXT: ori r4, r4, 17097
; P8LE-NEXT: ori r5, r5, 21445
-; P8LE-NEXT: mulhdu r3, r4, r3
+; P8LE-NEXT: mulhdu r4, r3, r4
; P8LE-NEXT: mulhdu r5, r7, r5
-; P8LE-NEXT: sub r7, r4, r3
+; P8LE-NEXT: sub r7, r3, r4
; P8LE-NEXT: rldicl r5, r5, 57, 7
; P8LE-NEXT: rldicl r7, r7, 63, 1
; P8LE-NEXT: mulli r5, r5, 654
-; P8LE-NEXT: add r3, r7, r3
+; P8LE-NEXT: add r4, r7, r4
; P8LE-NEXT: lis r7, -16037
; P8LE-NEXT: ori r7, r7, 28749
-; P8LE-NEXT: rldicl r3, r3, 60, 4
+; P8LE-NEXT: rldicl r4, r4, 60, 4
; P8LE-NEXT: sub r5, r6, r5
; P8LE-NEXT: rldic r7, r7, 32, 0
-; P8LE-NEXT: mulli r3, r3, 23
+; P8LE-NEXT: mulli r4, r4, 23
; P8LE-NEXT: oris r7, r7, 52170
; P8LE-NEXT: ori r7, r7, 12109
-; P8LE-NEXT: sub r3, r4, r3
+; P8LE-NEXT: sub r3, r3, r4
; P8LE-NEXT: mulhdu r7, r8, r7
; P8LE-NEXT: mtfprd f1, r3
; P8LE-NEXT: li r3, 0
diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll
index 07a8fb06caa11..cc38f921b117b 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i16_elts.ll
@@ -169,50 +169,50 @@ define <8 x i16> @test8elt(ptr nocapture readonly) local_unnamed_addr #2 {
; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
; CHECK-P8-NEXT: li r4, 16
; CHECK-P8-NEXT: lxvd2x vs2, r3, r4
-; CHECK-P8-NEXT: xxswapd v2, vs0
+; CHECK-P8-NEXT: xxswapd v3, vs0
; CHECK-P8-NEXT: xscvspdpn f0, vs0
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: mffprwz r3, f0
-; CHECK-P8-NEXT: xscvspdpn f0, v2
+; CHECK-P8-NEXT: xscvspdpn f0, v3
; CHECK-P8-NEXT: mtvsrd v4, r3
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: mffprwz r3, f0
-; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3
+; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3
; CHECK-P8-NEXT: xscvspdpn f1, vs1
; CHECK-P8-NEXT: xscvdpsxws f1, f1
-; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1
+; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1
; CHECK-P8-NEXT: mffprwz r4, f1
; CHECK-P8-NEXT: xscvspdpn f1, vs3
; CHECK-P8-NEXT: xscvdpsxws f1, f1
; CHECK-P8-NEXT: xscvspdpn f0, vs2
-; CHECK-P8-NEXT: xxswapd v3, vs2
+; CHECK-P8-NEXT: xxswapd v2, vs2
; CHECK-P8-NEXT: mtvsrd v5, r4
; CHECK-P8-NEXT: mffprwz r4, f1
-; CHECK-P8-NEXT: mtvsrd v2, r3
+; CHECK-P8-NEXT: mtvsrd v3, r3
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: mffprwz r3, f0
-; CHECK-P8-NEXT: xscvspdpn f0, v3
+; CHECK-P8-NEXT: xscvspdpn f0, v2
; CHECK-P8-NEXT: xscvdpsxws f0, f0
-; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3
+; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3
; CHECK-P8-NEXT: xscvspdpn f1, vs4
; CHECK-P8-NEXT: xscvdpsxws f1, f1
-; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1
+; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1
; CHECK-P8-NEXT: vmrghh v4, v4, v5
; CHECK-P8-NEXT: mtvsrd v5, r4
; CHECK-P8-NEXT: mffprwz r4, f1
; CHECK-P8-NEXT: xscvspdpn f1, vs5
; CHECK-P8-NEXT: mtvsrd v0, r4
-; CHECK-P8-NEXT: vmrghh v2, v2, v5
+; CHECK-P8-NEXT: vmrghh v3, v3, v5
; CHECK-P8-NEXT: mtvsrd v5, r3
; CHECK-P8-NEXT: mffprwz r3, f0
; CHECK-P8-NEXT: xscvdpsxws f0, f1
-; CHECK-P8-NEXT: mtvsrd v3, r3
+; CHECK-P8-NEXT: mtvsrd v2, r3
; CHECK-P8-NEXT: mffprwz r3, f0
; CHECK-P8-NEXT: vmrghh v5, v5, v0
; CHECK-P8-NEXT: mtvsrd v0, r3
-; CHECK-P8-NEXT: xxmrglw vs0, v2, v4
-; CHECK-P8-NEXT: vmrghh v3, v3, v0
-; CHECK-P8-NEXT: xxmrglw vs1, v3, v5
+; CHECK-P8-NEXT: xxmrglw vs0, v3, v4
+; CHECK-P8-NEXT: vmrghh v2, v2, v0
+; CHECK-P8-NEXT: xxmrglw vs1, v2, v5
; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0
; CHECK-P8-NEXT: blr
;
@@ -329,43 +329,43 @@ entry:
define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 {
; CHECK-P8-LABEL: test16elt:
; CHECK-P8: # %bb.0: # %entry
-; CHECK-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-P8-NEXT: lxvd2x vs1, 0, r4
; CHECK-P8-NEXT: li r6, 32
; CHECK-P8-NEXT: li r5, 16
; CHECK-P8-NEXT: lxvd2x vs6, r4, r6
; CHECK-P8-NEXT: li r6, 48
-; CHECK-P8-NEXT: lxvd2x vs2, r4, r5
+; CHECK-P8-NEXT: lxvd2x vs0, r4, r5
; CHECK-P8-NEXT: lxvd2x vs8, r4, r6
-; CHECK-P8-NEXT: xxswapd v3, vs0
-; CHECK-P8-NEXT: xscvspdpn f0, vs0
-; CHECK-P8-NEXT: xscvdpsxws f0, f0
-; CHECK-P8-NEXT: mffprwz r4, f0
-; CHECK-P8-NEXT: xscvspdpn f0, v3
-; CHECK-P8-NEXT: mtvsrd v0, r4
-; CHECK-P8-NEXT: xscvdpsxws f0, f0
-; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3
+; CHECK-P8-NEXT: xxswapd v3, vs1
; CHECK-P8-NEXT: xscvspdpn f1, vs1
; CHECK-P8-NEXT: xscvdpsxws f1, f1
-; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1
; CHECK-P8-NEXT: mffprwz r4, f1
-; CHECK-P8-NEXT: xscvspdpn f1, vs3
-; CHECK-P8-NEXT: xxswapd v2, vs2
-; CHECK-P8-NEXT: xxswapd v4, vs6
-; CHECK-P8-NEXT: xxswapd v5, vs8
-; CHECK-P8-NEXT: mtvsrd v1, r4
+; CHECK-P8-NEXT: xscvspdpn f1, v3
+; CHECK-P8-NEXT: mtvsrd v0, r4
; CHECK-P8-NEXT: xscvdpsxws f1, f1
+; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3
+; CHECK-P8-NEXT: xscvspdpn f2, vs2
+; CHECK-P8-NEXT: xscvdpsxws f2, f2
+; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1
+; CHECK-P8-NEXT: mffprwz r4, f2
+; CHECK-P8-NEXT: xscvspdpn f2, vs3
+; CHECK-P8-NEXT: xxswapd v2, vs0
+; CHECK-P8-NEXT: xscvspdpn f0, vs0
+; CHECK-P8-NEXT: xscvdpsxws f0, f0
+; CHECK-P8-NEXT: xxswapd v5, vs6
+; CHECK-P8-NEXT: xxswapd v4, vs8
+; CHECK-P8-NEXT: mtvsrd v1, r4
; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3
; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1
-; CHECK-P8-NEXT: xxsldwi vs7, v4, v4, 3
-; CHECK-P8-NEXT: xxsldwi vs9, v4, v4, 1
-; CHECK-P8-NEXT: xxsldwi vs10, v5, v5, 3
-; CHECK-P8-NEXT: xxsldwi vs11, v5, v5, 1
-; CHECK-P8-NEXT: mtvsrd v3, r4
+; CHECK-P8-NEXT: xscvdpsxws f2, f2
; CHECK-P8-NEXT: mffprwz r4, f1
; CHECK-P8-NEXT: xscvspdpn f1, vs4
-; CHECK-P8-NEXT: xscvdpsxws f0, f0
+; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3
+; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1
+; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3
+; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1
+; CHECK-P8-NEXT: mtvsrd v3, r4
+; CHECK-P8-NEXT: mffprwz r4, f2
; CHECK-P8-NEXT: xscvdpsxws f1, f1
; CHECK-P8-NEXT: vmrghh v0, v0, v1
; CHECK-P8-NEXT: mtvsrd v1, r4
@@ -388,7 +388,7 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n
; CHECK-P8-NEXT: vmrghh v1, v1, v6
; CHECK-P8-NEXT: mtvsrd v6, r4
; CHECK-P8-NEXT: mffprwz r4, f0
-; CHECK-P8-NEXT: xscvspdpn f0, v4
+; CHECK-P8-NEXT: xscvspdpn f0, v5
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: vmrghh v2, v2, v6
; CHECK-P8-NEXT: mtvsrd v6, r4
@@ -398,7 +398,7 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n
; CHECK-P8-NEXT: xscvdpsxws f1, f1
; CHECK-P8-NEXT: mffprwz r4, f0
; CHECK-P8-NEXT: xscvspdpn f0, vs10
-; CHECK-P8-NEXT: mtvsrd v4, r4
+; CHECK-P8-NEXT: mtvsrd v5, r4
; CHECK-P8-NEXT: mffprwz r4, f1
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: vmrghh v6, v6, v7
@@ -407,16 +407,16 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n
; CHECK-P8-NEXT: xscvspdpn f0, vs8
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: xxmrglw vs1, v2, v1
-; CHECK-P8-NEXT: vmrghh v4, v4, v7
+; CHECK-P8-NEXT: vmrghh v5, v5, v7
; CHECK-P8-NEXT: mtvsrd v7, r4
; CHECK-P8-NEXT: mffprwz r4, f0
-; CHECK-P8-NEXT: xscvspdpn f0, v5
+; CHECK-P8-NEXT: xscvspdpn f0, v4
; CHECK-P8-NEXT: mtvsrd v8, r4
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: mffprwz r4, f0
; CHECK-P8-NEXT: xscvspdpn f0, vs11
-; CHECK-P8-NEXT: xxmrglw vs2, v4, v6
-; CHECK-P8-NEXT: mtvsrd v5, r4
+; CHECK-P8-NEXT: xxmrglw vs2, v5, v6
+; CHECK-P8-NEXT: mtvsrd v4, r4
; CHECK-P8-NEXT: xscvdpsxws f0, f0
; CHECK-P8-NEXT: mffprwz r4, f0
; CHECK-P8-NEXT: vmrghh v7, v8, v7
@@ -424,8 +424,8 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n
; CHECK-P8-NEXT: xxmrglw vs0, v3, v0
; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0
; CHECK-P8-NEXT: xxswapd vs1, v2
-; CHECK-P8-NEXT: vmrghh v5, v5, v8
-; CHECK-P8-NEXT: xxmrglw vs3, v5, v7
+; CHECK-P8-NEXT: vmrghh v4, v4, v8
+; CHECK-P8-NEXT: xxmrglw vs3, v4, v7
; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2
; CHECK-P8-NEXT: xxswapd vs0, v3
; CHECK-P8-NEXT: stxvd2x vs0, r3, r5
@@ -534,38 +534,38 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n
;
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxv vs2, 16(r4)
-; CHECK-BE-NEXT: lxv vs1, 0(r4)
+; CHECK-BE-NEXT: lxv vs3, 16(r4)
+; CHECK-BE-NEXT: lxv vs2, 0(r4)
; CHECK-BE-NEXT: addis r5, r2, .LCPI3_0@toc@ha
-; CHECK-BE-NEXT: lxv vs0, 48(r4)
+; CHECK-BE-NEXT: lxv vs1, 48(r4)
; CHECK-BE-NEXT: addi r5, r5, .LCPI3_0@toc@l
-; CHECK-BE-NEXT: lxv vs3, 0(r5)
-; CHECK-BE-NEXT: xscvspdpn f6, vs2
-; CHECK-BE-NEXT: xxsldwi vs4, vs2, vs2, 3
-; CHECK-BE-NEXT: xscvspdpn f9, vs1
-; CHECK-BE-NEXT: xxswapd vs5, vs2
+; CHECK-BE-NEXT: lxv vs0, 0(r5)
+; CHECK-BE-NEXT: xscvspdpn f6, vs3
+; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3
+; CHECK-BE-NEXT: xscvspdpn f9, vs2
+; CHECK-BE-NEXT: xxswapd vs5, vs3
+; CHECK-BE-NEXT: xxsldwi vs3, vs3, vs3, 1
+; CHECK-BE-NEXT: xxsldwi vs7, vs2, vs2, 3
+; CHECK-BE-NEXT: xxswapd vs8, vs2
; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1
-; CHECK-BE-NEXT: xxsldwi vs7, vs1, vs1, 3
-; CHECK-BE-NEXT: xxswapd vs8, vs1
-; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1
-; CHECK-BE-NEXT: xxsldwi vs10, vs0, vs0, 3
-; CHECK-BE-NEXT: xxswapd vs11, vs0
+; CHECK-BE-NEXT: xxsldwi vs10, vs1, vs1, 3
+; CHECK-BE-NEXT: xxswapd vs11, vs1
; CHECK-BE-NEXT: xscvdpsxws f6, f6
; CHECK-BE-NEXT: xscvspdpn f4, vs4
; CHECK-BE-NEXT: xscvdpsxws f9, f9
; CHECK-BE-NEXT: xscvspdpn f5, vs5
-; CHECK-BE-NEXT: xscvspdpn f2, vs2
+; CHECK-BE-NEXT: xscvspdpn f3, vs3
; CHECK-BE-NEXT: xscvspdpn f7, vs7
; CHECK-BE-NEXT: xscvspdpn f8, vs8
-; CHECK-BE-NEXT: xscvspdpn f1, vs1
+; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: xscvspdpn f10, vs10
; CHECK-BE-NEXT: xscvspdpn f11,
vs11 ; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: xscvdpsxws f8, f8 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: xscvdpsxws f10, f10 ; CHECK-BE-NEXT: xscvdpsxws f11, f11 ; CHECK-BE-NEXT: mffprwz r5, f6 @@ -576,50 +576,50 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-BE-NEXT: mtfprwz f4, r5 ; CHECK-BE-NEXT: mffprwz r5, f5 ; CHECK-BE-NEXT: mtfprwz f5, r5 -; CHECK-BE-NEXT: mffprwz r5, f2 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 -; CHECK-BE-NEXT: xscvspdpn f5, vs0 -; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-BE-NEXT: mtfprwz f2, r5 +; CHECK-BE-NEXT: mffprwz r5, f3 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 +; CHECK-BE-NEXT: xscvspdpn f5, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: mtfprwz f3, r5 ; CHECK-BE-NEXT: mffprwz r5, f7 ; CHECK-BE-NEXT: mtfprwz f7, r5 ; CHECK-BE-NEXT: mffprwz r5, f8 -; CHECK-BE-NEXT: xxperm vs2, vs6, vs3 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 +; CHECK-BE-NEXT: xxperm vs3, vs6, vs0 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 ; CHECK-BE-NEXT: mtfprwz f8, r5 -; CHECK-BE-NEXT: mffprwz r5, f1 -; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs4 +; CHECK-BE-NEXT: mffprwz r5, f2 +; CHECK-BE-NEXT: xxmrghw vs3, vs3, vs4 ; CHECK-BE-NEXT: lxv vs4, 32(r4) -; CHECK-BE-NEXT: xscvdpsxws f0, f0 -; CHECK-BE-NEXT: mtfprwz f1, r5 -; CHECK-BE-NEXT: xxperm vs7, vs8, vs3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: mtfprwz f2, r5 +; CHECK-BE-NEXT: xxperm vs7, vs8, vs0 ; CHECK-BE-NEXT: mffprwz r5, f10 -; CHECK-BE-NEXT: xxperm vs1, vs9, vs3 +; CHECK-BE-NEXT: xxperm vs2, vs9, vs0 ; CHECK-BE-NEXT: mtfprwz f10, r5 ; CHECK-BE-NEXT: mffprwz r5, f11 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f11, r5 -; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs7 +; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs7 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs10, vs11, vs3 -; CHECK-BE-NEXT: mffprwz r4, f0 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 -; CHECK-BE-NEXT: xxsldwi vs2, vs4, vs4, 3 -; CHECK-BE-NEXT: mtfprwz f0, r4 -; CHECK-BE-NEXT: xxperm vs0, vs5, vs3 +; CHECK-BE-NEXT: xxperm vs10, vs11, vs0 +; CHECK-BE-NEXT: mffprwz r4, f1 +; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 +; CHECK-BE-NEXT: xxsldwi vs3, vs4, vs4, 3 +; CHECK-BE-NEXT: mtfprwz f1, r4 +; CHECK-BE-NEXT: xxperm vs1, vs5, vs0 ; CHECK-BE-NEXT: xxswapd vs5, vs4 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: stxv vs1, 0(r3) +; CHECK-BE-NEXT: xscvspdpn f3, vs3 +; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: xxmrghw vs0, vs0, vs10 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs10 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: mffprwz r4, f2 -; CHECK-BE-NEXT: mtfprwz f2, r4 +; CHECK-BE-NEXT: mffprwz r4, f3 +; CHECK-BE-NEXT: mtfprwz f3, r4 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs2, vs5, vs3 +; CHECK-BE-NEXT: xxperm vs3, vs5, vs0 ; CHECK-BE-NEXT: xscvspdpn f5, vs4 ; CHECK-BE-NEXT: xxsldwi vs4, vs4, vs4, 1 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 @@ -629,9 +629,9 @@ define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr n ; CHECK-BE-NEXT: mtfprwz f5, r4 ; CHECK-BE-NEXT: mffprwz r4, f4 ; CHECK-BE-NEXT: mtfprwz f4, r4 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 -; CHECK-BE-NEXT: xxmrghw vs2, vs4, vs2 -; 
CHECK-BE-NEXT: xxmrghd vs0, vs2, vs0 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 +; CHECK-BE-NEXT: xxmrghw vs0, vs4, vs3 +; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: blr entry: @@ -801,50 +801,50 @@ define <8 x i16> @test8elt_signed(ptr nocapture readonly) local_unnamed_addr #2 ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxswapd v3, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xxswapd v2, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 ; CHECK-P8-NEXT: vmrghh v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghh v2, v2, v5 +; CHECK-P8-NEXT: vmrghh v3, v3, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghh v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 -; CHECK-P8-NEXT: vmrghh v3, v3, v0 -; CHECK-P8-NEXT: xxmrglw vs1, v3, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v3, v4 +; CHECK-P8-NEXT: vmrghh v2, v2, v0 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -961,43 +961,43 @@ entry: define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs1, 0, r4 ; CHECK-P8-NEXT: li r6, 32 ; CHECK-P8-NEXT: li r5, 16 ; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs2, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 ; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 -; CHECK-P8-NEXT: xxswapd v3, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 -; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxswapd v3, vs1 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, vs3 -; CHECK-P8-NEXT: xxswapd 
v2, vs2 -; CHECK-P8-NEXT: xxswapd v4, vs6 -; CHECK-P8-NEXT: xxswapd v5, vs8 -; CHECK-P8-NEXT: mtvsrd v1, r4 +; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: mtvsrd v0, r4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, vs2 +; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f2, vs2 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xscvspdpn f2, vs3 +; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xxswapd v5, vs6 +; CHECK-P8-NEXT: xxswapd v4, vs8 +; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 -; CHECK-P8-NEXT: xxsldwi vs7, v4, v4, 3 -; CHECK-P8-NEXT: xxsldwi vs9, v4, v4, 1 -; CHECK-P8-NEXT: xxsldwi vs10, v5, v5, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v5, v5, 1 -; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 +; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 +; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: mffprwz r4, f2 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: vmrghh v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 @@ -1020,7 +1020,7 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: vmrghh v1, v1, v6 ; CHECK-P8-NEXT: mtvsrd v6, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghh v2, v2, v6 ; CHECK-P8-NEXT: mtvsrd v6, r4 @@ -1030,7 +1030,7 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs10 -; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghh v6, v6, v7 @@ -1039,16 +1039,16 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: xscvspdpn f0, vs8 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: xxmrglw vs1, v2, v1 -; CHECK-P8-NEXT: vmrghh v4, v4, v7 +; CHECK-P8-NEXT: vmrghh v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v5 +; CHECK-P8-NEXT: xscvspdpn f0, v4 ; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 -; CHECK-P8-NEXT: xxmrglw vs2, v4, v6 -; CHECK-P8-NEXT: mtvsrd v5, r4 +; CHECK-P8-NEXT: xxmrglw vs2, v5, v6 +; CHECK-P8-NEXT: mtvsrd v4, r4 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r4, f0 ; CHECK-P8-NEXT: vmrghh v7, v8, v7 @@ -1056,8 +1056,8 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-P8-NEXT: xxmrglw vs0, v3, v0 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: xxswapd vs1, v2 -; CHECK-P8-NEXT: vmrghh v5, v5, v8 -; CHECK-P8-NEXT: xxmrglw vs3, v5, v7 +; CHECK-P8-NEXT: vmrghh v4, v4, v8 +; CHECK-P8-NEXT: xxmrglw vs3, v4, v7 ; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 @@ -1166,38 +1166,38 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) 
%agg.result ; ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs2, 16(r4) -; CHECK-BE-NEXT: lxv vs1, 0(r4) +; CHECK-BE-NEXT: lxv vs3, 16(r4) +; CHECK-BE-NEXT: lxv vs2, 0(r4) ; CHECK-BE-NEXT: addis r5, r2, .LCPI7_0@toc@ha -; CHECK-BE-NEXT: lxv vs0, 48(r4) +; CHECK-BE-NEXT: lxv vs1, 48(r4) ; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l -; CHECK-BE-NEXT: lxv vs3, 0(r5) -; CHECK-BE-NEXT: xscvspdpn f6, vs2 -; CHECK-BE-NEXT: xxsldwi vs4, vs2, vs2, 3 -; CHECK-BE-NEXT: xscvspdpn f9, vs1 -; CHECK-BE-NEXT: xxswapd vs5, vs2 +; CHECK-BE-NEXT: lxv vs0, 0(r5) +; CHECK-BE-NEXT: xscvspdpn f6, vs3 +; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3 +; CHECK-BE-NEXT: xscvspdpn f9, vs2 +; CHECK-BE-NEXT: xxswapd vs5, vs3 +; CHECK-BE-NEXT: xxsldwi vs3, vs3, vs3, 1 +; CHECK-BE-NEXT: xxsldwi vs7, vs2, vs2, 3 +; CHECK-BE-NEXT: xxswapd vs8, vs2 ; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-BE-NEXT: xxsldwi vs7, vs1, vs1, 3 -; CHECK-BE-NEXT: xxswapd vs8, vs1 -; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 -; CHECK-BE-NEXT: xxsldwi vs10, vs0, vs0, 3 -; CHECK-BE-NEXT: xxswapd vs11, vs0 +; CHECK-BE-NEXT: xxsldwi vs10, vs1, vs1, 3 +; CHECK-BE-NEXT: xxswapd vs11, vs1 ; CHECK-BE-NEXT: xscvdpsxws f6, f6 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 ; CHECK-BE-NEXT: xscvdpsxws f9, f9 ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: xscvspdpn f7, vs7 ; CHECK-BE-NEXT: xscvspdpn f8, vs8 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 +; CHECK-BE-NEXT: xscvspdpn f2, vs2 ; CHECK-BE-NEXT: xscvspdpn f10, vs10 ; CHECK-BE-NEXT: xscvspdpn f11, vs11 ; CHECK-BE-NEXT: xscvdpsxws f4, f4 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 ; CHECK-BE-NEXT: xscvdpsxws f7, f7 ; CHECK-BE-NEXT: xscvdpsxws f8, f8 -; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: xscvdpsxws f2, f2 ; CHECK-BE-NEXT: xscvdpsxws f10, f10 ; CHECK-BE-NEXT: xscvdpsxws f11, f11 ; CHECK-BE-NEXT: mffprwz r5, f6 @@ -1208,50 +1208,50 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-BE-NEXT: mtfprwz f4, r5 ; CHECK-BE-NEXT: mffprwz r5, f5 ; CHECK-BE-NEXT: mtfprwz f5, r5 -; CHECK-BE-NEXT: mffprwz r5, f2 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 -; CHECK-BE-NEXT: xscvspdpn f5, vs0 -; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-BE-NEXT: mtfprwz f2, r5 +; CHECK-BE-NEXT: mffprwz r5, f3 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 +; CHECK-BE-NEXT: xscvspdpn f5, vs1 +; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-BE-NEXT: mtfprwz f3, r5 ; CHECK-BE-NEXT: mffprwz r5, f7 ; CHECK-BE-NEXT: mtfprwz f7, r5 ; CHECK-BE-NEXT: mffprwz r5, f8 -; CHECK-BE-NEXT: xxperm vs2, vs6, vs3 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 +; CHECK-BE-NEXT: xxperm vs3, vs6, vs0 +; CHECK-BE-NEXT: xscvspdpn f1, vs1 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 ; CHECK-BE-NEXT: mtfprwz f8, r5 -; CHECK-BE-NEXT: mffprwz r5, f1 -; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs4 +; CHECK-BE-NEXT: mffprwz r5, f2 +; CHECK-BE-NEXT: xxmrghw vs3, vs3, vs4 ; CHECK-BE-NEXT: lxv vs4, 32(r4) -; CHECK-BE-NEXT: xscvdpsxws f0, f0 -; CHECK-BE-NEXT: mtfprwz f1, r5 -; CHECK-BE-NEXT: xxperm vs7, vs8, vs3 +; CHECK-BE-NEXT: xscvdpsxws f1, f1 +; CHECK-BE-NEXT: mtfprwz f2, r5 +; CHECK-BE-NEXT: xxperm vs7, vs8, vs0 ; CHECK-BE-NEXT: mffprwz r5, f10 -; CHECK-BE-NEXT: xxperm vs1, vs9, vs3 +; CHECK-BE-NEXT: xxperm vs2, vs9, vs0 ; CHECK-BE-NEXT: mtfprwz f10, r5 ; CHECK-BE-NEXT: mffprwz r5, f11 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f11, r5 -; CHECK-BE-NEXT: xxmrghw vs1, 
vs1, vs7 +; CHECK-BE-NEXT: xxmrghw vs2, vs2, vs7 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs10, vs11, vs3 -; CHECK-BE-NEXT: mffprwz r4, f0 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 -; CHECK-BE-NEXT: xxsldwi vs2, vs4, vs4, 3 -; CHECK-BE-NEXT: mtfprwz f0, r4 -; CHECK-BE-NEXT: xxperm vs0, vs5, vs3 +; CHECK-BE-NEXT: xxperm vs10, vs11, vs0 +; CHECK-BE-NEXT: mffprwz r4, f1 +; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 +; CHECK-BE-NEXT: xxsldwi vs3, vs4, vs4, 3 +; CHECK-BE-NEXT: mtfprwz f1, r4 +; CHECK-BE-NEXT: xxperm vs1, vs5, vs0 ; CHECK-BE-NEXT: xxswapd vs5, vs4 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: stxv vs1, 0(r3) +; CHECK-BE-NEXT: xscvspdpn f3, vs3 +; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvdpsxws f2, f2 -; CHECK-BE-NEXT: xxmrghw vs0, vs0, vs10 +; CHECK-BE-NEXT: xscvdpsxws f3, f3 +; CHECK-BE-NEXT: xxmrghw vs1, vs1, vs10 ; CHECK-BE-NEXT: xscvdpsxws f5, f5 -; CHECK-BE-NEXT: mffprwz r4, f2 -; CHECK-BE-NEXT: mtfprwz f2, r4 +; CHECK-BE-NEXT: mffprwz r4, f3 +; CHECK-BE-NEXT: mtfprwz f3, r4 ; CHECK-BE-NEXT: mffprwz r4, f5 ; CHECK-BE-NEXT: mtfprwz f5, r4 -; CHECK-BE-NEXT: xxperm vs2, vs5, vs3 +; CHECK-BE-NEXT: xxperm vs3, vs5, vs0 ; CHECK-BE-NEXT: xscvspdpn f5, vs4 ; CHECK-BE-NEXT: xxsldwi vs4, vs4, vs4, 1 ; CHECK-BE-NEXT: xscvspdpn f4, vs4 @@ -1261,9 +1261,9 @@ define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result ; CHECK-BE-NEXT: mtfprwz f5, r4 ; CHECK-BE-NEXT: mffprwz r4, f4 ; CHECK-BE-NEXT: mtfprwz f4, r4 -; CHECK-BE-NEXT: xxperm vs4, vs5, vs3 -; CHECK-BE-NEXT: xxmrghw vs2, vs4, vs2 -; CHECK-BE-NEXT: xxmrghd vs0, vs2, vs0 +; CHECK-BE-NEXT: xxperm vs4, vs5, vs0 +; CHECK-BE-NEXT: xxmrghw vs0, vs4, vs3 +; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 ; CHECK-BE-NEXT: stxv vs0, 16(r3) ; CHECK-BE-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll index dfa49a8278157..c6e808d145ebb 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll @@ -178,51 +178,51 @@ define i64 @test8elt(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxswapd v3, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xxswapd v2, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, 
v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 ; CHECK-P8-NEXT: vmrghb v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: vmrghb v2, v2, v5 +; CHECK-P8-NEXT: vmrghb v3, v3, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: vmrglh v2, v2, v4 +; CHECK-P8-NEXT: vmrglh v3, v3, v4 ; CHECK-P8-NEXT: vmrghb v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: vmrghb v3, v3, v0 -; CHECK-P8-NEXT: vmrglh v3, v3, v5 -; CHECK-P8-NEXT: xxmrglw vs0, v3, v2 +; CHECK-P8-NEXT: vmrghb v2, v2, v0 +; CHECK-P8-NEXT: vmrglh v2, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 ; CHECK-P8-NEXT: xxswapd vs0, vs0 ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: blr @@ -343,47 +343,47 @@ entry: define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: li r4, 32 ; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 48 ; CHECK-P8-NEXT: lxvd2x vs8, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 -; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxswapd v3, vs1 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, vs3 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: mtvsrd v0, r3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxswapd v2, vs2 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f2, vs2 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xscvspdpn f2, vs3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: xxswapd v4, vs6 -; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 +; CHECK-P8-NEXT: xxswapd v5, vs6 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xxswapd v4, vs8 ; CHECK-P8-NEXT: mtvsrd v3, r3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, v2 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 -; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 +; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 -; CHECK-P8-NEXT: xxsldwi vs7, v4, v4, 3 -; CHECK-P8-NEXT: xxswapd v5, vs8 -; CHECK-P8-NEXT: xxsldwi vs9, v4, v4, 1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs10, v5, v5, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v5, v5, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 ; CHECK-P8-NEXT: vmrghb v0, 
v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 @@ -400,7 +400,7 @@ define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: vmrglh v3, v3, v0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v1, v1, v6 @@ -414,32 +414,32 @@ define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-NEXT: mtvsrd v6, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs10 -; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs8 ; CHECK-P8-NEXT: vmrglh v2, v2, v1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v6, v6, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: vmrghb v4, v4, v7 +; CHECK-P8-NEXT: vmrghb v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v5 +; CHECK-P8-NEXT: xscvspdpn f0, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 -; CHECK-P8-NEXT: vmrglh v4, v4, v6 -; CHECK-P8-NEXT: mtvsrd v5, r3 +; CHECK-P8-NEXT: vmrglh v5, v5, v6 +; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghb v7, v8, v7 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 -; CHECK-P8-NEXT: vmrghb v5, v5, v8 -; CHECK-P8-NEXT: vmrglh v5, v5, v7 -; CHECK-P8-NEXT: xxmrglw vs1, v5, v4 +; CHECK-P8-NEXT: vmrghb v4, v4, v8 +; CHECK-P8-NEXT: vmrglh v4, v4, v7 +; CHECK-P8-NEXT: xxmrglw vs1, v4, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -818,51 +818,51 @@ define i64 @test8elt_signed(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-P8-NEXT: li r4, 16 ; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 -; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxswapd v3, vs0 ; CHECK-P8-NEXT: xscvspdpn f0, vs0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v2 +; CHECK-P8-NEXT: xscvspdpn f0, v3 ; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 +; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 ; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xxswapd v2, vs2 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrd v3, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 +; CHECK-P8-NEXT: xscvspdpn f0, v2 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs4, v3, v3, 3 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: xscvspdpn f1, vs4 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 1 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 ; CHECK-P8-NEXT: vmrghb v4, v4, v5 ; CHECK-P8-NEXT: mtvsrd v5, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 ; CHECK-P8-NEXT: xscvspdpn f1, vs5 ; CHECK-P8-NEXT: mtvsrd v0, r4 -; CHECK-P8-NEXT: 
vmrghb v2, v2, v5 +; CHECK-P8-NEXT: vmrghb v3, v3, v5 ; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvdpsxws f0, f1 -; CHECK-P8-NEXT: mtvsrd v3, r3 +; CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: vmrglh v2, v2, v4 +; CHECK-P8-NEXT: vmrglh v3, v3, v4 ; CHECK-P8-NEXT: vmrghb v5, v5, v0 ; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: vmrghb v3, v3, v0 -; CHECK-P8-NEXT: vmrglh v3, v3, v5 -; CHECK-P8-NEXT: xxmrglw vs0, v3, v2 +; CHECK-P8-NEXT: vmrghb v2, v2, v0 +; CHECK-P8-NEXT: vmrglh v2, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 ; CHECK-P8-NEXT: xxswapd vs0, vs0 ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: blr @@ -983,47 +983,47 @@ entry: define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: li r4, 32 ; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 48 ; CHECK-P8-NEXT: lxvd2x vs8, r3, r4 -; CHECK-P8-NEXT: xxswapd v3, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v3 -; CHECK-P8-NEXT: mtvsrd v0, r3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xxsldwi vs1, v3, v3, 3 +; CHECK-P8-NEXT: xxswapd v3, vs1 ; CHECK-P8-NEXT: xscvspdpn f1, vs1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f0, vs2 -; CHECK-P8-NEXT: mffprwz r4, f1 -; CHECK-P8-NEXT: xscvspdpn f1, vs3 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xscvspdpn f1, v3 +; CHECK-P8-NEXT: mtvsrd v0, r3 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxswapd v2, vs2 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: xxsldwi vs2, v3, v3, 3 +; CHECK-P8-NEXT: xscvspdpn f2, vs2 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: xxsldwi vs3, v3, v3, 1 +; CHECK-P8-NEXT: xscvspdpn f0, vs0 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xscvspdpn f2, vs3 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: xxswapd v4, vs6 -; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 +; CHECK-P8-NEXT: xxswapd v5, vs6 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: xxswapd v4, vs8 ; CHECK-P8-NEXT: mtvsrd v3, r3 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, v2 -; CHECK-P8-NEXT: xxsldwi vs4, v2, v2, 3 -; CHECK-P8-NEXT: xscvspdpn f1, vs4 +; CHECK-P8-NEXT: xxsldwi vs7, v5, v5, 3 +; CHECK-P8-NEXT: xxsldwi vs9, v5, v5, 1 ; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxsldwi vs5, v2, v2, 1 -; CHECK-P8-NEXT: xxsldwi vs7, v4, v4, 3 -; CHECK-P8-NEXT: xxswapd v5, vs8 -; CHECK-P8-NEXT: xxsldwi vs9, v4, v4, 1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xxsldwi vs10, v5, v5, 3 -; CHECK-P8-NEXT: xxsldwi vs11, v5, v5, 1 +; CHECK-P8-NEXT: xxsldwi vs10, v4, v4, 3 +; CHECK-P8-NEXT: xxsldwi vs11, v4, v4, 1 ; CHECK-P8-NEXT: vmrghb v0, v0, v1 ; CHECK-P8-NEXT: mtvsrd v1, r4 ; CHECK-P8-NEXT: mffprwz r4, f1 @@ -1040,7 +1040,7 @@ define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; 
CHECK-P8-NEXT: mtvsrd v2, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v4 +; CHECK-P8-NEXT: xscvspdpn f0, v5 ; CHECK-P8-NEXT: vmrglh v3, v3, v0 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v1, v1, v6 @@ -1054,32 +1054,32 @@ define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #3 ; CHECK-P8-NEXT: mtvsrd v6, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs10 -; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: mtvsrd v5, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs8 ; CHECK-P8-NEXT: vmrglh v2, v2, v1 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: vmrghb v6, v6, v7 ; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: vmrghb v4, v4, v7 +; CHECK-P8-NEXT: vmrghb v5, v5, v7 ; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvspdpn f0, v5 +; CHECK-P8-NEXT: xscvspdpn f0, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: xscvspdpn f0, vs11 -; CHECK-P8-NEXT: vmrglh v4, v4, v6 -; CHECK-P8-NEXT: mtvsrd v5, r3 +; CHECK-P8-NEXT: vmrglh v5, v5, v6 +; CHECK-P8-NEXT: mtvsrd v4, r3 ; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mffprwz r3, f0 ; CHECK-P8-NEXT: vmrghb v7, v8, v7 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: xxmrglw vs0, v2, v3 -; CHECK-P8-NEXT: vmrghb v5, v5, v8 -; CHECK-P8-NEXT: vmrglh v5, v5, v7 -; CHECK-P8-NEXT: xxmrglw vs1, v5, v4 +; CHECK-P8-NEXT: vmrghb v4, v4, v8 +; CHECK-P8-NEXT: vmrglh v4, v4, v7 +; CHECK-P8-NEXT: xxmrglw vs1, v4, v5 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll index 5dcb2e4be3e37..00ca205e85972 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i16_elts.ll @@ -288,92 +288,92 @@ entry: define void @test16elt(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs2, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs5, 0, r4 ; CHECK-P8-NEXT: li r5, 80 ; CHECK-P8-NEXT: li r6, 32 -; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs3, r4, r5 ; CHECK-P8-NEXT: li r5, 16 -; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 ; CHECK-P8-NEXT: li r6, 64 -; CHECK-P8-NEXT: lxvd2x vs4, r4, r5 -; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs7, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs11, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 -; CHECK-P8-NEXT: li r6, 96 ; CHECK-P8-NEXT: lxvd2x vs12, r4, r6 +; CHECK-P8-NEXT: li r6, 96 +; CHECK-P8-NEXT: lxvd2x vs2, r4, r6 ; CHECK-P8-NEXT: li r6, 112 -; CHECK-P8-NEXT: lxvd2x v2, r4, r6 -; CHECK-P8-NEXT: xxswapd vs3, vs2 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: mtvsrd v4, r4 -; CHECK-P8-NEXT: xxswapd vs7, vs6 +; CHECK-P8-NEXT: lxvd2x vs0, r4, r6 +; CHECK-P8-NEXT: xxswapd vs6, vs5 +; CHECK-P8-NEXT: xscvdpsxws f5, f5 ; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: xxswapd vs1, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f7, f7 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxswapd vs5, vs4 +; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: xxswapd vs10, vs9 +; CHECK-P8-NEXT: 
xscvdpsxws f9, f9 +; CHECK-P8-NEXT: xxswapd vs4, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 ; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r4, f4 -; CHECK-P8-NEXT: xxswapd vs10, vs8 +; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: mffprwz r4, f7 +; CHECK-P8-NEXT: xxswapd vs13, vs11 +; CHECK-P8-NEXT: xscvdpsxws f11, f11 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 ; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: xxswapd vs11, vs9 -; CHECK-P8-NEXT: xscvdpsxws f9, f9 -; CHECK-P8-NEXT: mffprwz r4, f6 -; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: xxswapd v2, vs12 +; CHECK-P8-NEXT: xscvdpsxws f12, f12 ; CHECK-P8-NEXT: mffprwz r4, f9 +; CHECK-P8-NEXT: xscvdpsxws v2, v2 +; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: mffprwz r4, f12 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f8 -; CHECK-P8-NEXT: xxswapd vs13, vs12 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mffprwz r4, f11 +; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xscvdpsxws v3, v3 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: mtvsrd v6, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, f12 -; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f3 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f6 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f5 -; CHECK-P8-NEXT: xxswapd v3, v2 +; CHECK-P8-NEXT: mffprwz r4, f8 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f7 +; CHECK-P8-NEXT: mffprwz r4, f10 ; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mffprwz r4, f11 +; CHECK-P8-NEXT: mfvsrwz r4, v2 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 ; CHECK-P8-NEXT: vmrghh v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f10 +; CHECK-P8-NEXT: mffprwz r4, f4 ; CHECK-P8-NEXT: vmrghh v5, v9, v5 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: mfvsrwz r4, v3 ; CHECK-P8-NEXT: vmrghh v0, v10, v0 -; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mffprwz r4, f13 -; CHECK-P8-NEXT: vmrghh v1, v8, v1 -; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v1, v0 -; CHECK-P8-NEXT: vmrghh v6, v9, v6 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v2 -; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: vmrghh v2, v2, v1 +; CHECK-P8-NEXT: vmrghh v3, v8, v6 +; CHECK-P8-NEXT: mtvsrd v6, r4 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: vmrghh v1, v9, v7 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xxmrglw vs2, v1, v3 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v0 +; CHECK-P8-NEXT: vmrghh v6, v6, v7 +; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: mtvsrd v2, r4 -; CHECK-P8-NEXT: vmrghh v7, v10, v7 -; CHECK-P8-NEXT: xxmrglw vs2, v7, v6 -; CHECK-P8-NEXT: vmrghh v8, v8, v9 ; CHECK-P8-NEXT: xxmrglw vs0, v5, v4 -; CHECK-P8-NEXT: vmrghh v2, v3, v2 -; CHECK-P8-NEXT: xxmrglw vs3, v2, v8 +; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 -; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs1, v2 +; CHECK-P8-NEXT: vmrghh v7, v7, v8 +; CHECK-P8-NEXT: 
xxmrglw vs3, v7, v6 +; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 ; CHECK-P8-NEXT: stxvd2x vs1, 0, r3 @@ -835,92 +835,92 @@ entry: define void @test16elt_signed(ptr noalias nocapture sret(<16 x i16>) %agg.result, ptr nocapture readonly) local_unnamed_addr #3 { ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: lxvd2x vs2, 0, r4 +; CHECK-P8-NEXT: lxvd2x vs5, 0, r4 ; CHECK-P8-NEXT: li r5, 80 ; CHECK-P8-NEXT: li r6, 32 -; CHECK-P8-NEXT: lxvd2x vs0, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs3, r4, r5 ; CHECK-P8-NEXT: li r5, 16 -; CHECK-P8-NEXT: lxvd2x vs6, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 ; CHECK-P8-NEXT: li r6, 64 -; CHECK-P8-NEXT: lxvd2x vs4, r4, r5 -; CHECK-P8-NEXT: lxvd2x vs8, r4, r6 +; CHECK-P8-NEXT: lxvd2x vs7, r4, r5 +; CHECK-P8-NEXT: lxvd2x vs11, r4, r6 ; CHECK-P8-NEXT: li r6, 48 -; CHECK-P8-NEXT: lxvd2x vs9, r4, r6 -; CHECK-P8-NEXT: li r6, 96 ; CHECK-P8-NEXT: lxvd2x vs12, r4, r6 +; CHECK-P8-NEXT: li r6, 96 +; CHECK-P8-NEXT: lxvd2x vs2, r4, r6 ; CHECK-P8-NEXT: li r6, 112 -; CHECK-P8-NEXT: lxvd2x v2, r4, r6 -; CHECK-P8-NEXT: xxswapd vs3, vs2 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: mffprwz r4, f2 -; CHECK-P8-NEXT: mtvsrd v4, r4 -; CHECK-P8-NEXT: xxswapd vs7, vs6 +; CHECK-P8-NEXT: lxvd2x vs0, r4, r6 +; CHECK-P8-NEXT: xxswapd vs6, vs5 +; CHECK-P8-NEXT: xscvdpsxws f5, f5 ; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: xxswapd vs1, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f7, f7 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: xxswapd vs5, vs4 +; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: mtvsrd v4, r4 +; CHECK-P8-NEXT: xxswapd vs10, vs9 +; CHECK-P8-NEXT: xscvdpsxws f9, f9 +; CHECK-P8-NEXT: xxswapd vs4, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 ; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r4, f4 -; CHECK-P8-NEXT: xxswapd vs10, vs8 +; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: mffprwz r4, f7 +; CHECK-P8-NEXT: xxswapd vs13, vs11 +; CHECK-P8-NEXT: xscvdpsxws f11, f11 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 ; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: xxswapd vs11, vs9 -; CHECK-P8-NEXT: xscvdpsxws f9, f9 -; CHECK-P8-NEXT: mffprwz r4, f6 -; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: xxswapd v2, vs12 +; CHECK-P8-NEXT: xscvdpsxws f12, f12 ; CHECK-P8-NEXT: mffprwz r4, f9 +; CHECK-P8-NEXT: xscvdpsxws v2, v2 +; CHECK-P8-NEXT: mtvsrd v0, r4 +; CHECK-P8-NEXT: mffprwz r4, f12 ; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f8 -; CHECK-P8-NEXT: xxswapd vs13, vs12 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mffprwz r4, f11 +; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xscvdpsxws v3, v3 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 ; CHECK-P8-NEXT: mtvsrd v6, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, f12 -; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f3 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f6 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f5 -; CHECK-P8-NEXT: xxswapd v3, v2 +; CHECK-P8-NEXT: mffprwz r4, f8 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f7 +; 
CHECK-P8-NEXT: mffprwz r4, f10 ; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mffprwz r4, f11 +; CHECK-P8-NEXT: mfvsrwz r4, v2 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 ; CHECK-P8-NEXT: vmrghh v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f10 +; CHECK-P8-NEXT: mffprwz r4, f4 ; CHECK-P8-NEXT: vmrghh v5, v9, v5 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: mfvsrwz r4, v3 ; CHECK-P8-NEXT: vmrghh v0, v10, v0 -; CHECK-P8-NEXT: mtvsrd v10, r4 -; CHECK-P8-NEXT: mffprwz r4, f13 -; CHECK-P8-NEXT: vmrghh v1, v8, v1 -; CHECK-P8-NEXT: mtvsrd v8, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v1, v0 -; CHECK-P8-NEXT: vmrghh v6, v9, v6 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v2 -; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: vmrghh v2, v2, v1 +; CHECK-P8-NEXT: vmrghh v3, v8, v6 +; CHECK-P8-NEXT: mtvsrd v6, r4 +; CHECK-P8-NEXT: mffprwz r4, f2 +; CHECK-P8-NEXT: vmrghh v1, v9, v7 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f1 +; CHECK-P8-NEXT: xxmrglw vs2, v1, v3 +; CHECK-P8-NEXT: xxmrglw vs1, v2, v0 +; CHECK-P8-NEXT: vmrghh v6, v6, v7 +; CHECK-P8-NEXT: mtvsrd v7, r4 ; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: mtvsrd v2, r4 -; CHECK-P8-NEXT: vmrghh v7, v10, v7 -; CHECK-P8-NEXT: xxmrglw vs2, v7, v6 -; CHECK-P8-NEXT: vmrghh v8, v8, v9 ; CHECK-P8-NEXT: xxmrglw vs0, v5, v4 -; CHECK-P8-NEXT: vmrghh v2, v3, v2 -; CHECK-P8-NEXT: xxmrglw vs3, v2, v8 +; CHECK-P8-NEXT: mtvsrd v8, r4 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 -; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs1, v2 +; CHECK-P8-NEXT: vmrghh v7, v7, v8 +; CHECK-P8-NEXT: xxmrglw vs3, v7, v6 +; CHECK-P8-NEXT: xxmrgld v3, vs3, vs2 ; CHECK-P8-NEXT: xxswapd vs0, v3 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r5 ; CHECK-P8-NEXT: stxvd2x vs1, 0, r3 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll index dd5cb59a48bf0..770689ba98049 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp64_to_i8_elts.ll @@ -303,90 +303,90 @@ define <16 x i8> @test16elt(ptr nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: li r4, 80 -; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 -; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 -; CHECK-P8-NEXT: li r4, 48 +; CHECK-P8-NEXT: lxvd2x vs4, 0, r3 ; CHECK-P8-NEXT: lxvd2x vs3, r3, r4 +; CHECK-P8-NEXT: li r4, 48 +; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs4, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs7, r3, r4 ; CHECK-P8-NEXT: li r4, 32 -; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 -; CHECK-P8-NEXT: li r4, 64 ; CHECK-P8-NEXT: lxvd2x vs9, r3, r4 -; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: li r4, 64 ; CHECK-P8-NEXT: lxvd2x vs12, r3, r4 +; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 ; CHECK-P8-NEXT: li r4, 112 -; CHECK-P8-NEXT: lxvd2x v2, r3, r4 -; CHECK-P8-NEXT: xxswapd vs2, vs1 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: mtvsrd v4, r3 -; CHECK-P8-NEXT: xxswapd vs10, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 -; CHECK-P8-NEXT: xxswapd vs7, vs3 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: xxswapd vs5, vs4 
; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: mffprwz r3, f4 ; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: mffprwz r4, f3 -; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xxswapd vs8, vs6 +; CHECK-P8-NEXT: mtvsrd v4, r3 +; CHECK-P8-NEXT: xxswapd vs13, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: xxswapd vs10, vs6 ; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: mffprwz r3, f6 -; CHECK-P8-NEXT: mtvsrd v0, r3 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 +; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: mffprwz r4, f7 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f0, f12 -; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: mtvsrd v5, r4 +; CHECK-P8-NEXT: mffprwz r4, f6 +; CHECK-P8-NEXT: mtvsrd v1, r4 +; CHECK-P8-NEXT: mffprwz r4, f3 ; CHECK-P8-NEXT: xxswapd vs11, vs9 ; CHECK-P8-NEXT: xscvdpsxws f9, f9 ; CHECK-P8-NEXT: mffprwz r3, f9 -; CHECK-P8-NEXT: mtvsrd v6, r3 -; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: mtvsrd v0, r3 ; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f8 -; CHECK-P8-NEXT: xxswapd vs13, vs12 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f8 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f7 -; CHECK-P8-NEXT: xxswapd v3, v2 -; CHECK-P8-NEXT: vmrghb v4, v8, v4 +; CHECK-P8-NEXT: xxswapd v2, vs12 +; CHECK-P8-NEXT: xscvdpsxws f12, f12 +; CHECK-P8-NEXT: mffprwz r3, f12 +; CHECK-P8-NEXT: mtvsrd v6, r3 +; CHECK-P8-NEXT: mffprwz r3, f5 +; CHECK-P8-NEXT: xscvdpsxws v2, v2 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: mffprwz r3, f11 -; CHECK-P8-NEXT: vmrghb v5, v9, v5 -; CHECK-P8-NEXT: mtvsrd v9, r4 +; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xscvdpsxws v3, v3 ; CHECK-P8-NEXT: mffprwz r4, f10 -; CHECK-P8-NEXT: vmrghb v0, v8, v0 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: vmrghb v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f13 -; CHECK-P8-NEXT: vmrghb v1, v9, v1 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: vmrghb v6, v8, v6 +; CHECK-P8-NEXT: mfvsrwz r3, v2 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: vmrghb v5, v9, v5 +; CHECK-P8-NEXT: vmrghb v0, v8, v0 ; CHECK-P8-NEXT: mtvsrd v8, r3 +; CHECK-P8-NEXT: mfvsrwz r3, v3 +; CHECK-P8-NEXT: vmrglh v4, v5, v4 +; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: vmrghb v2, v2, v1 +; CHECK-P8-NEXT: vmrghb v1, v8, v6 +; CHECK-P8-NEXT: mtvsrd v6, r3 +; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: vmrglh v2, v2, v0 +; CHECK-P8-NEXT: vmrghb v3, v3, v7 +; CHECK-P8-NEXT: mtvsrd v7, r3 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: vmrglh v3, v3, v1 +; CHECK-P8-NEXT: vmrghb v6, v6, v7 +; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v3 -; CHECK-P8-NEXT: vmrghb v7, v9, v7 -; CHECK-P8-NEXT: mtvsrd v9, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v2 -; CHECK-P8-NEXT: mtvsrd v3, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: mtvsrd v2, r3 -; CHECK-P8-NEXT: vmrghb v8, v8, v9 -; CHECK-P8-NEXT: vmrghb v2, v3, v2 -; CHECK-P8-NEXT: vmrglh v3, v5, v4 -; CHECK-P8-NEXT: vmrglh v4, v1, v0 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 
+; CHECK-P8-NEXT: mtvsrd v8, r3 +; CHECK-P8-NEXT: vmrghb v7, v7, v8 ; CHECK-P8-NEXT: vmrglh v5, v7, v6 -; CHECK-P8-NEXT: vmrglh v2, v2, v8 -; CHECK-P8-NEXT: xxmrglw vs0, v4, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs1, v5, v3 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; @@ -858,90 +858,90 @@ define <16 x i8> @test16elt_signed(ptr nocapture readonly) local_unnamed_addr #2 ; CHECK-P8-LABEL: test16elt_signed: ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: li r4, 80 -; CHECK-P8-NEXT: lxvd2x vs1, 0, r3 -; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 -; CHECK-P8-NEXT: li r4, 48 +; CHECK-P8-NEXT: lxvd2x vs4, 0, r3 ; CHECK-P8-NEXT: lxvd2x vs3, r3, r4 +; CHECK-P8-NEXT: li r4, 48 +; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: lxvd2x vs4, r3, r4 +; CHECK-P8-NEXT: lxvd2x vs7, r3, r4 ; CHECK-P8-NEXT: li r4, 32 -; CHECK-P8-NEXT: lxvd2x vs6, r3, r4 -; CHECK-P8-NEXT: li r4, 64 ; CHECK-P8-NEXT: lxvd2x vs9, r3, r4 -; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: li r4, 64 ; CHECK-P8-NEXT: lxvd2x vs12, r3, r4 +; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: lxvd2x vs2, r3, r4 ; CHECK-P8-NEXT: li r4, 112 -; CHECK-P8-NEXT: lxvd2x v2, r3, r4 -; CHECK-P8-NEXT: xxswapd vs2, vs1 -; CHECK-P8-NEXT: xscvdpsxws f1, f1 -; CHECK-P8-NEXT: mffprwz r3, f1 -; CHECK-P8-NEXT: xscvdpsxws f2, f2 -; CHECK-P8-NEXT: mtvsrd v4, r3 -; CHECK-P8-NEXT: xxswapd vs10, vs0 -; CHECK-P8-NEXT: xscvdpsxws f0, f0 -; CHECK-P8-NEXT: xscvdpsxws f10, f10 -; CHECK-P8-NEXT: xxswapd vs7, vs3 -; CHECK-P8-NEXT: xscvdpsxws f3, f3 -; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: lxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: xxswapd vs5, vs4 ; CHECK-P8-NEXT: xscvdpsxws f4, f4 -; CHECK-P8-NEXT: mffprwz r4, f4 +; CHECK-P8-NEXT: mffprwz r3, f4 ; CHECK-P8-NEXT: xscvdpsxws f5, f5 -; CHECK-P8-NEXT: mtvsrd v5, r4 -; CHECK-P8-NEXT: mffprwz r4, f3 -; CHECK-P8-NEXT: mtvsrd v1, r4 -; CHECK-P8-NEXT: mffprwz r4, f0 -; CHECK-P8-NEXT: xxswapd vs8, vs6 +; CHECK-P8-NEXT: mtvsrd v4, r3 +; CHECK-P8-NEXT: xxswapd vs13, vs3 +; CHECK-P8-NEXT: xscvdpsxws f3, f3 +; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: xxswapd vs10, vs6 ; CHECK-P8-NEXT: xscvdpsxws f6, f6 -; CHECK-P8-NEXT: mffprwz r3, f6 -; CHECK-P8-NEXT: mtvsrd v0, r3 +; CHECK-P8-NEXT: xscvdpsxws f10, f10 +; CHECK-P8-NEXT: xxswapd vs8, vs7 +; CHECK-P8-NEXT: xscvdpsxws f7, f7 +; CHECK-P8-NEXT: mffprwz r4, f7 ; CHECK-P8-NEXT: xscvdpsxws f8, f8 -; CHECK-P8-NEXT: xscvdpsxws f0, f12 -; CHECK-P8-NEXT: mtvsrd v7, r4 -; CHECK-P8-NEXT: mffprwz r4, f5 +; CHECK-P8-NEXT: mtvsrd v5, r4 +; CHECK-P8-NEXT: mffprwz r4, f6 +; CHECK-P8-NEXT: mtvsrd v1, r4 +; CHECK-P8-NEXT: mffprwz r4, f3 ; CHECK-P8-NEXT: xxswapd vs11, vs9 ; CHECK-P8-NEXT: xscvdpsxws f9, f9 ; CHECK-P8-NEXT: mffprwz r3, f9 -; CHECK-P8-NEXT: mtvsrd v6, r3 -; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: mtvsrd v0, r3 ; CHECK-P8-NEXT: xscvdpsxws f11, f11 -; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f8 -; CHECK-P8-NEXT: xxswapd vs13, vs12 -; CHECK-P8-NEXT: xscvdpsxws f13, f13 +; CHECK-P8-NEXT: mtvsrd v7, r4 +; CHECK-P8-NEXT: mffprwz r4, f8 ; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: mffprwz r4, f7 -; CHECK-P8-NEXT: xxswapd v3, v2 -; CHECK-P8-NEXT: vmrghb v4, v8, v4 +; CHECK-P8-NEXT: xxswapd v2, vs12 +; CHECK-P8-NEXT: xscvdpsxws f12, f12 +; CHECK-P8-NEXT: mffprwz r3, f12 +; CHECK-P8-NEXT: mtvsrd v6, r3 +; CHECK-P8-NEXT: mffprwz r3, f5 +; CHECK-P8-NEXT: xscvdpsxws v2, v2 ; CHECK-P8-NEXT: mtvsrd v8, r3 ; CHECK-P8-NEXT: mffprwz r3, f11 -; CHECK-P8-NEXT: vmrghb v5, v9, v5 -; CHECK-P8-NEXT: mtvsrd 
v9, r4 +; CHECK-P8-NEXT: xxswapd v3, vs2 +; CHECK-P8-NEXT: xscvdpsxws v3, v3 ; CHECK-P8-NEXT: mffprwz r4, f10 -; CHECK-P8-NEXT: vmrghb v0, v8, v0 +; CHECK-P8-NEXT: xscvdpsxws f2, f2 +; CHECK-P8-NEXT: xxswapd vs1, vs0 +; CHECK-P8-NEXT: xscvdpsxws f1, f1 +; CHECK-P8-NEXT: xscvdpsxws f0, f0 +; CHECK-P8-NEXT: vmrghb v4, v8, v4 ; CHECK-P8-NEXT: mtvsrd v8, r3 -; CHECK-P8-NEXT: mffprwz r3, f13 -; CHECK-P8-NEXT: vmrghb v1, v9, v1 -; CHECK-P8-NEXT: mtvsrd v9, r4 -; CHECK-P8-NEXT: vmrghb v6, v8, v6 +; CHECK-P8-NEXT: mfvsrwz r3, v2 +; CHECK-P8-NEXT: mtvsrd v2, r4 +; CHECK-P8-NEXT: mffprwz r4, f13 +; CHECK-P8-NEXT: vmrghb v5, v9, v5 +; CHECK-P8-NEXT: vmrghb v0, v8, v0 ; CHECK-P8-NEXT: mtvsrd v8, r3 +; CHECK-P8-NEXT: mfvsrwz r3, v3 +; CHECK-P8-NEXT: vmrglh v4, v5, v4 +; CHECK-P8-NEXT: mtvsrd v3, r4 +; CHECK-P8-NEXT: vmrghb v2, v2, v1 +; CHECK-P8-NEXT: vmrghb v1, v8, v6 +; CHECK-P8-NEXT: mtvsrd v6, r3 +; CHECK-P8-NEXT: mffprwz r3, f2 +; CHECK-P8-NEXT: vmrglh v2, v2, v0 +; CHECK-P8-NEXT: vmrghb v3, v3, v7 +; CHECK-P8-NEXT: mtvsrd v7, r3 +; CHECK-P8-NEXT: mffprwz r3, f1 +; CHECK-P8-NEXT: vmrglh v3, v3, v1 +; CHECK-P8-NEXT: vmrghb v6, v6, v7 +; CHECK-P8-NEXT: mtvsrd v7, r3 ; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v3 -; CHECK-P8-NEXT: vmrghb v7, v9, v7 -; CHECK-P8-NEXT: mtvsrd v9, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: xscvdpsxws f0, v2 -; CHECK-P8-NEXT: mtvsrd v3, r3 -; CHECK-P8-NEXT: mffprwz r3, f0 -; CHECK-P8-NEXT: mtvsrd v2, r3 -; CHECK-P8-NEXT: vmrghb v8, v8, v9 -; CHECK-P8-NEXT: vmrghb v2, v3, v2 -; CHECK-P8-NEXT: vmrglh v3, v5, v4 -; CHECK-P8-NEXT: vmrglh v4, v1, v0 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v4 +; CHECK-P8-NEXT: mtvsrd v8, r3 +; CHECK-P8-NEXT: vmrghb v7, v7, v8 ; CHECK-P8-NEXT: vmrglh v5, v7, v6 -; CHECK-P8-NEXT: vmrglh v2, v2, v8 -; CHECK-P8-NEXT: xxmrglw vs0, v4, v3 -; CHECK-P8-NEXT: xxmrglw vs1, v2, v5 +; CHECK-P8-NEXT: xxmrglw vs1, v5, v3 ; CHECK-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-P8-NEXT: blr ; diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index a9636cdf8bb17..277951782ce5c 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -108,6 +108,7 @@ ; CHECK-NEXT: RISC-V Pre-RA pseudo instruction expansion pass ; CHECK-NEXT: RISC-V Merge Base Offset ; CHECK-NEXT: RISC-V Insert VSETVLI pass +; CHECK-NEXT: RISC-V Dead register definitions ; CHECK-NEXT: RISC-V Insert Read/Write CSR Pass ; CHECK-NEXT: Detect Dead Lanes ; CHECK-NEXT: RISC-V init undef pass diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll index e952e71a5d736..895852b84e004 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll @@ -7,12 +7,12 @@ define void @amoswap_w_discard(ptr %a, i32 %b) nounwind { ; RV32-LABEL: amoswap_w_discard: ; RV32: # %bb.0: -; RV32-NEXT: amoswap.w.aqrl a0, a1, (a0) +; RV32-NEXT: amoswap.w.aqrl zero, a1, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: amoswap_w_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoswap.w.aqrl a0, a1, (a0) +; RV64-NEXT: amoswap.w.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw xchg ptr %a, i32 %b seq_cst ret void @@ -31,7 +31,7 @@ define void @amoswap_d_discard(ptr %a, i64 %b) nounwind { ; ; RV64-LABEL: amoswap_d_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoswap.d.aqrl a0, a1, (a0) +; RV64-NEXT: amoswap.d.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw xchg ptr %a, i64 %b seq_cst ret void @@ -40,12 +40,12 @@ define void @amoswap_d_discard(ptr 
%a, i64 %b) nounwind { define void @amoadd_w_discard(ptr %a, i32 %b) nounwind { ; RV32-LABEL: amoadd_w_discard: ; RV32: # %bb.0: -; RV32-NEXT: amoadd.w.aqrl a0, a1, (a0) +; RV32-NEXT: amoadd.w.aqrl zero, a1, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: amoadd_w_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoadd.w.aqrl a0, a1, (a0) +; RV64-NEXT: amoadd.w.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw add ptr %a, i32 %b seq_cst ret void @@ -64,7 +64,7 @@ define void @amoadd_d_discard(ptr %a, i64 %b) nounwind { ; ; RV64-LABEL: amoadd_d_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoadd.d.aqrl a0, a1, (a0) +; RV64-NEXT: amoadd.d.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw add ptr %a, i64 %b seq_cst ret void @@ -73,12 +73,12 @@ define void @amoadd_d_discard(ptr %a, i64 %b) nounwind { define void @amoand_w_discard(ptr %a, i32 %b) nounwind { ; RV32-LABEL: amoand_w_discard: ; RV32: # %bb.0: -; RV32-NEXT: amoand.w.aqrl a0, a1, (a0) +; RV32-NEXT: amoand.w.aqrl zero, a1, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: amoand_w_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoand.w.aqrl a0, a1, (a0) +; RV64-NEXT: amoand.w.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw and ptr %a, i32 %b seq_cst ret void @@ -97,7 +97,7 @@ define void @amoand_d_discard(ptr %a, i64 %b) nounwind { ; ; RV64-LABEL: amoand_d_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoand.d.aqrl a0, a1, (a0) +; RV64-NEXT: amoand.d.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw and ptr %a, i64 %b seq_cst ret void @@ -106,12 +106,12 @@ define void @amoand_d_discard(ptr %a, i64 %b) nounwind { define void @amoor_w_discard(ptr %a, i32 %b) nounwind { ; RV32-LABEL: amoor_w_discard: ; RV32: # %bb.0: -; RV32-NEXT: amoor.w.aqrl a0, a1, (a0) +; RV32-NEXT: amoor.w.aqrl zero, a1, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: amoor_w_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoor.w.aqrl a0, a1, (a0) +; RV64-NEXT: amoor.w.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw or ptr %a, i32 %b seq_cst ret void @@ -130,7 +130,7 @@ define void @amoor_d_discard(ptr %a, i64 %b) nounwind { ; ; RV64-LABEL: amoor_d_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoor.d.aqrl a0, a1, (a0) +; RV64-NEXT: amoor.d.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw or ptr %a, i64 %b seq_cst ret void @@ -139,12 +139,12 @@ define void @amoor_d_discard(ptr %a, i64 %b) nounwind { define void @amoxor_w_discard(ptr %a, i32 %b) nounwind { ; RV32-LABEL: amoxor_w_discard: ; RV32: # %bb.0: -; RV32-NEXT: amoor.w.aqrl a0, a1, (a0) +; RV32-NEXT: amoor.w.aqrl zero, a1, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: amoxor_w_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoor.w.aqrl a0, a1, (a0) +; RV64-NEXT: amoor.w.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw or ptr %a, i32 %b seq_cst ret void @@ -163,7 +163,7 @@ define void @amoxor_d_discard(ptr %a, i64 %b) nounwind { ; ; RV64-LABEL: amoxor_d_discard: ; RV64: # %bb.0: -; RV64-NEXT: amoor.d.aqrl a0, a1, (a0) +; RV64-NEXT: amoor.d.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw or ptr %a, i64 %b seq_cst ret void @@ -172,12 +172,12 @@ define void @amoxor_d_discard(ptr %a, i64 %b) nounwind { define void @amomax_w_discard(ptr %a, i32 %b) nounwind { ; RV32-LABEL: amomax_w_discard: ; RV32: # %bb.0: -; RV32-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV32-NEXT: amomax.w.aqrl zero, a1, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: amomax_w_discard: ; RV64: # %bb.0: -; RV64-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV64-NEXT: amomax.w.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw max ptr %a, i32 %b seq_cst ret void @@ -239,7 +239,7 @@ define void @amomax_d_discard(ptr %a, i64 %b) 
nounwind { ; ; RV64-LABEL: amomax_d_discard: ; RV64: # %bb.0: -; RV64-NEXT: amomax.d.aqrl a0, a1, (a0) +; RV64-NEXT: amomax.d.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw max ptr %a, i64 %b seq_cst ret void @@ -248,12 +248,12 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind { define void @amomaxu_w_discard(ptr %a, i32 %b) nounwind { ; RV32-LABEL: amomaxu_w_discard: ; RV32: # %bb.0: -; RV32-NEXT: amomaxu.w.aqrl a0, a1, (a0) +; RV32-NEXT: amomaxu.w.aqrl zero, a1, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: amomaxu_w_discard: ; RV64: # %bb.0: -; RV64-NEXT: amomaxu.w.aqrl a0, a1, (a0) +; RV64-NEXT: amomaxu.w.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw umax ptr %a, i32 %b seq_cst ret void @@ -315,7 +315,7 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind { ; ; RV64-LABEL: amomaxu_d_discard: ; RV64: # %bb.0: -; RV64-NEXT: amomaxu.d.aqrl a0, a1, (a0) +; RV64-NEXT: amomaxu.d.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw umax ptr %a, i64 %b seq_cst ret void @@ -324,12 +324,12 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind { define void @amomin_w_discard(ptr %a, i32 %b) nounwind { ; RV32-LABEL: amomin_w_discard: ; RV32: # %bb.0: -; RV32-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV32-NEXT: amomin.w.aqrl zero, a1, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: amomin_w_discard: ; RV64: # %bb.0: -; RV64-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV64-NEXT: amomin.w.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw min ptr %a, i32 %b seq_cst ret void @@ -391,7 +391,7 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind { ; ; RV64-LABEL: amomin_d_discard: ; RV64: # %bb.0: -; RV64-NEXT: amomin.d.aqrl a0, a1, (a0) +; RV64-NEXT: amomin.d.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw min ptr %a, i64 %b seq_cst ret void @@ -400,12 +400,12 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind { define void @amominu_w_discard(ptr %a, i32 %b) nounwind { ; RV32-LABEL: amominu_w_discard: ; RV32: # %bb.0: -; RV32-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV32-NEXT: amominu.w.aqrl zero, a1, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: amominu_w_discard: ; RV64: # %bb.0: -; RV64-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV64-NEXT: amominu.w.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw umin ptr %a, i32 %b seq_cst ret void @@ -467,7 +467,7 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind { ; ; RV64-LABEL: amominu_d_discard: ; RV64: # %bb.0: -; RV64-NEXT: amominu.d.aqrl a0, a1, (a0) +; RV64-NEXT: amominu.d.aqrl zero, a1, (a0) ; RV64-NEXT: ret %1 = atomicrmw umin ptr %a, i64 %b seq_cst ret void diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll index c47d4484d8c0d..c4f224dcba1b2 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll @@ -515,16 +515,27 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_0_i8_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a2, 255 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: not a2, a2 -; RV32IA-NEXT: amoand.w.aq a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i8_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a2, 255 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: not a2, a2 +; RV32IA-WMO-NEXT: amoand.w.aq a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; 
+; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i8_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a2, 255 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: not a2, a2 +; RV32IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_0_i8_acquire: ; RV64I: # %bb.0: @@ -537,16 +548,27 @@ define i8 @atomicrmw_xchg_0_i8_acquire(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_0_i8_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a2, 255 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: not a2, a2 -; RV64IA-NEXT: amoand.w.aq a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_0_i8_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a2, 255 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: not a2, a2 +; RV64IA-WMO-NEXT: amoand.w.aq a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_0_i8_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a2, 255 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: not a2, a2 +; RV64IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 0 acquire ret i8 %1 } @@ -563,16 +585,27 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_0_i8_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a2, 255 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: not a2, a2 -; RV32IA-NEXT: amoand.w.rl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i8_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a2, 255 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: not a2, a2 +; RV32IA-WMO-NEXT: amoand.w.rl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i8_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a2, 255 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: not a2, a2 +; RV32IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_0_i8_release: ; RV64I: # %bb.0: @@ -585,16 +618,27 @@ define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_0_i8_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a2, 255 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: not a2, a2 -; RV64IA-NEXT: amoand.w.rl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_0_i8_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a2, 255 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: not a2, a2 +; RV64IA-WMO-NEXT: amoand.w.rl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; 
RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_0_i8_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a2, 255 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: not a2, a2 +; RV64IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 0 release ret i8 %1 } @@ -611,16 +655,27 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_0_i8_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a2, 255 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: not a2, a2 -; RV32IA-NEXT: amoand.w.aqrl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i8_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a2, 255 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: not a2, a2 +; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i8_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a2, 255 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: not a2, a2 +; RV32IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_0_i8_acq_rel: ; RV64I: # %bb.0: @@ -633,16 +688,27 @@ define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_0_i8_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a2, 255 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: not a2, a2 -; RV64IA-NEXT: amoand.w.aqrl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_0_i8_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a2, 255 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: not a2, a2 +; RV64IA-WMO-NEXT: amoand.w.aqrl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_0_i8_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a2, 255 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: not a2, a2 +; RV64IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 0 acq_rel ret i8 %1 } @@ -659,16 +725,27 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_0_i8_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a2, 255 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: not a2, a2 -; RV32IA-NEXT: amoand.w.aqrl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i8_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a2, 255 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: not a2, a2 +; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a2, (a1) +; 
RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i8_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a2, 255 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: not a2, a2 +; RV32IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_0_i8_seq_cst: ; RV64I: # %bb.0: @@ -681,16 +758,27 @@ define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_0_i8_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a2, 255 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: not a2, a2 -; RV64IA-NEXT: amoand.w.aqrl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_0_i8_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a2, 255 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: not a2, a2 +; RV64IA-WMO-NEXT: amoand.w.aqrl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_0_i8_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a2, 255 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: not a2, a2 +; RV64IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 0 seq_cst ret i8 %1 } @@ -753,15 +841,25 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_minus_1_i8_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a2, 255 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: amoor.w.aq a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a2, 255 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: amoor.w.aq a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a2, 255 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_minus_1_i8_acquire: ; RV64I: # %bb.0: @@ -774,15 +872,25 @@ define i8 @atomicrmw_xchg_minus_1_i8_acquire(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_minus_1_i8_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a2, 255 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: amoor.w.aq a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a2, 255 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: amoor.w.aq a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: 
atomicrmw_xchg_minus_1_i8_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a2, 255 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 -1 acquire ret i8 %1 } @@ -799,15 +907,25 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_minus_1_i8_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a2, 255 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: amoor.w.rl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a2, 255 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: amoor.w.rl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a2, 255 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_minus_1_i8_release: ; RV64I: # %bb.0: @@ -820,15 +938,25 @@ define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_minus_1_i8_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a2, 255 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: amoor.w.rl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a2, 255 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: amoor.w.rl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a2, 255 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 -1 release ret i8 %1 } @@ -845,15 +973,25 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a2, 255 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: amoor.w.aqrl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a2, 255 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: amoor.w.aqrl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a2, 255 +; RV32IA-TSO-NEXT: sll 
a2, a2, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel: ; RV64I: # %bb.0: @@ -866,15 +1004,25 @@ define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a2, 255 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: amoor.w.aqrl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a2, 255 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: amoor.w.aqrl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a2, 255 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 -1 acq_rel ret i8 %1 } @@ -891,15 +1039,25 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a2, 255 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: amoor.w.aqrl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a2, 255 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: amoor.w.aqrl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a2, 255 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst: ; RV64I: # %bb.0: @@ -912,15 +1070,25 @@ define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a2, 255 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: amoor.w.aqrl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a2, 255 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: amoor.w.aqrl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a2, 255 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i8 -1 seq_cst 
ret i8 %1 } @@ -1868,43 +2036,71 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_and_i8_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 -; RV32IA-NEXT: not a3, a3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a3, a1 -; RV32IA-NEXT: amoand.w.aq a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_and_i8_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a3, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 +; RV32IA-WMO-NEXT: not a3, a3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: or a1, a3, a1 +; RV32IA-WMO-NEXT: amoand.w.aq a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret ; -; RV64I-LABEL: atomicrmw_and_i8_acquire: -; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV32IA-TSO-LABEL: atomicrmw_and_i8_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a3, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 +; RV32IA-TSO-NEXT: not a3, a3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: or a1, a3, a1 +; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret +; +; RV64I-LABEL: atomicrmw_and_i8_acquire: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: li a2, 2 ; RV64I-NEXT: call __atomic_fetch_and_1@plt ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_and_i8_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 -; RV64IA-NEXT: not a3, a3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: or a1, a3, a1 -; RV64IA-NEXT: amoand.w.aq a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_and_i8_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a3, 255 +; RV64IA-WMO-NEXT: sllw a3, a3, a0 +; RV64IA-WMO-NEXT: not a3, a3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: or a1, a3, a1 +; RV64IA-WMO-NEXT: amoand.w.aq a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_and_i8_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a3, 255 +; RV64IA-TSO-NEXT: sllw a3, a3, a0 +; RV64IA-TSO-NEXT: not a3, a3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: or a1, a3, a1 +; RV64IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw and ptr %a, i8 %b acquire ret i8 %1 } @@ -1920,19 +2116,33 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_and_i8_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a3, 255 -; 
RV32IA-NEXT: sll a3, a3, a0 -; RV32IA-NEXT: not a3, a3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a3, a1 -; RV32IA-NEXT: amoand.w.rl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_and_i8_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a3, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 +; RV32IA-WMO-NEXT: not a3, a3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: or a1, a3, a1 +; RV32IA-WMO-NEXT: amoand.w.rl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_and_i8_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a3, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 +; RV32IA-TSO-NEXT: not a3, a3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: or a1, a3, a1 +; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_and_i8_release: ; RV64I: # %bb.0: @@ -1944,19 +2154,33 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_and_i8_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 -; RV64IA-NEXT: not a3, a3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: or a1, a3, a1 -; RV64IA-NEXT: amoand.w.rl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_and_i8_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a3, 255 +; RV64IA-WMO-NEXT: sllw a3, a3, a0 +; RV64IA-WMO-NEXT: not a3, a3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: or a1, a3, a1 +; RV64IA-WMO-NEXT: amoand.w.rl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_and_i8_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a3, 255 +; RV64IA-TSO-NEXT: sllw a3, a3, a0 +; RV64IA-TSO-NEXT: not a3, a3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: or a1, a3, a1 +; RV64IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw and ptr %a, i8 %b release ret i8 %1 } @@ -1972,19 +2196,33 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_and_i8_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 -; RV32IA-NEXT: not a3, a3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a3, a1 -; RV32IA-NEXT: amoand.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_and_i8_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a3, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 +; RV32IA-WMO-NEXT: not a3, a3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; 
RV32IA-WMO-NEXT: or a1, a3, a1 +; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_and_i8_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a3, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 +; RV32IA-TSO-NEXT: not a3, a3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: or a1, a3, a1 +; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_and_i8_acq_rel: ; RV64I: # %bb.0: @@ -1996,19 +2234,33 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_and_i8_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 -; RV64IA-NEXT: not a3, a3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: or a1, a3, a1 -; RV64IA-NEXT: amoand.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_and_i8_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a3, 255 +; RV64IA-WMO-NEXT: sllw a3, a3, a0 +; RV64IA-WMO-NEXT: not a3, a3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: or a1, a3, a1 +; RV64IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_and_i8_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a3, 255 +; RV64IA-TSO-NEXT: sllw a3, a3, a0 +; RV64IA-TSO-NEXT: not a3, a3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: or a1, a3, a1 +; RV64IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw and ptr %a, i8 %b acq_rel ret i8 %1 } @@ -2024,19 +2276,33 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_and_i8_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: li a3, 255 -; RV32IA-NEXT: sll a3, a3, a0 -; RV32IA-NEXT: not a3, a3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a3, a1 -; RV32IA-NEXT: amoand.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_and_i8_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: li a3, 255 +; RV32IA-WMO-NEXT: sll a3, a3, a0 +; RV32IA-WMO-NEXT: not a3, a3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: or a1, a3, a1 +; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_and_i8_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: li a3, 255 +; RV32IA-TSO-NEXT: sll a3, a3, a0 +; RV32IA-TSO-NEXT: not a3, a3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: or a1, a3, a1 +; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 
+; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_and_i8_seq_cst: ; RV64I: # %bb.0: @@ -2048,19 +2314,33 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_and_i8_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: li a3, 255 -; RV64IA-NEXT: sllw a3, a3, a0 -; RV64IA-NEXT: not a3, a3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: or a1, a3, a1 -; RV64IA-NEXT: amoand.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_and_i8_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: li a3, 255 +; RV64IA-WMO-NEXT: sllw a3, a3, a0 +; RV64IA-WMO-NEXT: not a3, a3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: or a1, a3, a1 +; RV64IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_and_i8_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: li a3, 255 +; RV64IA-TSO-NEXT: sllw a3, a3, a0 +; RV64IA-TSO-NEXT: not a3, a3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: or a1, a3, a1 +; RV64IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw and ptr %a, i8 %b seq_cst ret i8 %1 } @@ -2576,15 +2856,25 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_or_i8_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoor.w.aq a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_or_i8_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoor.w.aq a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_or_i8_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_or_i8_acquire: ; RV64I: # %bb.0: @@ -2596,15 +2886,25 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_or_i8_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoor.w.aq a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_or_i8_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoor.w.aq a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_or_i8_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: 
slli a0, a0, 3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw or ptr %a, i8 %b acquire ret i8 %1 } @@ -2620,15 +2920,25 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_or_i8_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoor.w.rl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_or_i8_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoor.w.rl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_or_i8_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_or_i8_release: ; RV64I: # %bb.0: @@ -2640,15 +2950,25 @@ define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_or_i8_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoor.w.rl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_or_i8_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoor.w.rl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_or_i8_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw or ptr %a, i8 %b release ret i8 %1 } @@ -2664,15 +2984,25 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_or_i8_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoor.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_or_i8_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoor.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_or_i8_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_or_i8_acq_rel: ; RV64I: 
# %bb.0: @@ -2684,15 +3014,25 @@ define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_or_i8_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoor.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_or_i8_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoor.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_or_i8_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw or ptr %a, i8 %b acq_rel ret i8 %1 } @@ -2708,15 +3048,25 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_or_i8_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoor.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_or_i8_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoor.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_or_i8_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_or_i8_seq_cst: ; RV64I: # %bb.0: @@ -2728,15 +3078,25 @@ define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_or_i8_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoor.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_or_i8_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoor.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_or_i8_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw or ptr %a, i8 %b seq_cst ret i8 %1 } @@ -2796,15 +3156,25 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xor_i8_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: 
andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoxor.w.aq a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xor_i8_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoxor.w.aq a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xor_i8_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xor_i8_acquire: ; RV64I: # %bb.0: @@ -2816,15 +3186,25 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xor_i8_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoxor.w.aq a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xor_i8_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoxor.w.aq a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xor_i8_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xor ptr %a, i8 %b acquire ret i8 %1 } @@ -2840,15 +3220,25 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xor_i8_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoxor.w.rl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xor_i8_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoxor.w.rl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xor_i8_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xor_i8_release: ; RV64I: # %bb.0: @@ -2860,15 +3250,25 @@ define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xor_i8_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoxor.w.rl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; 
RV64IA-WMO-LABEL: atomicrmw_xor_i8_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoxor.w.rl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xor_i8_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xor ptr %a, i8 %b release ret i8 %1 } @@ -2884,15 +3284,25 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xor_i8_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoxor.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xor_i8_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoxor.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xor_i8_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xor_i8_acq_rel: ; RV64I: # %bb.0: @@ -2904,15 +3314,25 @@ define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xor_i8_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoxor.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xor_i8_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoxor.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xor_i8_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xor ptr %a, i8 %b acq_rel ret i8 %1 } @@ -2928,15 +3348,25 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xor_i8_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: andi a1, a1, 255 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoxor.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xor_i8_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: andi a1, a1, 255 +; RV32IA-WMO-NEXT: sll a1, 
a1, a0 +; RV32IA-WMO-NEXT: amoxor.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xor_i8_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: andi a1, a1, 255 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xor_i8_seq_cst: ; RV64I: # %bb.0: @@ -2948,15 +3378,25 @@ define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xor_i8_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: andi a1, a1, 255 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoxor.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xor_i8_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: andi a1, a1, 255 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoxor.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xor_i8_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: andi a1, a1, 255 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xor ptr %a, i8 %b seq_cst ret i8 %1 } @@ -6898,17 +7338,29 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_0_i16_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a2, 16 -; RV32IA-NEXT: addi a2, a2, -1 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: not a2, a2 -; RV32IA-NEXT: amoand.w.aq a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i16_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a2, 16 +; RV32IA-WMO-NEXT: addi a2, a2, -1 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: not a2, a2 +; RV32IA-WMO-NEXT: amoand.w.aq a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i16_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a2, 16 +; RV32IA-TSO-NEXT: addi a2, a2, -1 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: not a2, a2 +; RV32IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_0_i16_acquire: ; RV64I: # %bb.0: @@ -6921,17 +7373,29 @@ define i16 @atomicrmw_xchg_0_i16_acquire(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_0_i16_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a2, 16 -; RV64IA-NEXT: addiw a2, a2, -1 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: not a2, a2 -; RV64IA-NEXT: amoand.w.aq a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_0_i16_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: 
lui a2, 16 +; RV64IA-WMO-NEXT: addiw a2, a2, -1 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: not a2, a2 +; RV64IA-WMO-NEXT: amoand.w.aq a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_0_i16_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a2, 16 +; RV64IA-TSO-NEXT: addiw a2, a2, -1 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: not a2, a2 +; RV64IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 0 acquire ret i16 %1 } @@ -6948,17 +7412,29 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_0_i16_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a2, 16 -; RV32IA-NEXT: addi a2, a2, -1 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: not a2, a2 -; RV32IA-NEXT: amoand.w.rl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i16_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a2, 16 +; RV32IA-WMO-NEXT: addi a2, a2, -1 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: not a2, a2 +; RV32IA-WMO-NEXT: amoand.w.rl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i16_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a2, 16 +; RV32IA-TSO-NEXT: addi a2, a2, -1 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: not a2, a2 +; RV32IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_0_i16_release: ; RV64I: # %bb.0: @@ -6971,17 +7447,29 @@ define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_0_i16_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a2, 16 -; RV64IA-NEXT: addiw a2, a2, -1 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: not a2, a2 -; RV64IA-NEXT: amoand.w.rl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_0_i16_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a2, 16 +; RV64IA-WMO-NEXT: addiw a2, a2, -1 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: not a2, a2 +; RV64IA-WMO-NEXT: amoand.w.rl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_0_i16_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a2, 16 +; RV64IA-TSO-NEXT: addiw a2, a2, -1 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: not a2, a2 +; RV64IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 0 release ret i16 %1 } @@ -6998,17 +7486,29 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_0_i16_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui 
a2, 16 -; RV32IA-NEXT: addi a2, a2, -1 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: not a2, a2 -; RV32IA-NEXT: amoand.w.aqrl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i16_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a2, 16 +; RV32IA-WMO-NEXT: addi a2, a2, -1 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: not a2, a2 +; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i16_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a2, 16 +; RV32IA-TSO-NEXT: addi a2, a2, -1 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: not a2, a2 +; RV32IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_0_i16_acq_rel: ; RV64I: # %bb.0: @@ -7021,17 +7521,29 @@ define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_0_i16_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a2, 16 -; RV64IA-NEXT: addiw a2, a2, -1 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: not a2, a2 -; RV64IA-NEXT: amoand.w.aqrl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_0_i16_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a2, 16 +; RV64IA-WMO-NEXT: addiw a2, a2, -1 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: not a2, a2 +; RV64IA-WMO-NEXT: amoand.w.aqrl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_0_i16_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a2, 16 +; RV64IA-TSO-NEXT: addiw a2, a2, -1 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: not a2, a2 +; RV64IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 0 acq_rel ret i16 %1 } @@ -7048,17 +7560,29 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_0_i16_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a2, 16 -; RV32IA-NEXT: addi a2, a2, -1 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: not a2, a2 -; RV32IA-NEXT: amoand.w.aqrl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_0_i16_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a2, 16 +; RV32IA-WMO-NEXT: addi a2, a2, -1 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: not a2, a2 +; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_0_i16_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a2, 16 +; RV32IA-TSO-NEXT: addi a2, a2, -1 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: not a2, a2 +; RV32IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; 
RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_0_i16_seq_cst: ; RV64I: # %bb.0: @@ -7071,17 +7595,29 @@ define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_0_i16_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a2, 16 -; RV64IA-NEXT: addiw a2, a2, -1 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: not a2, a2 -; RV64IA-NEXT: amoand.w.aqrl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_0_i16_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a2, 16 +; RV64IA-WMO-NEXT: addiw a2, a2, -1 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: not a2, a2 +; RV64IA-WMO-NEXT: amoand.w.aqrl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_0_i16_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a2, 16 +; RV64IA-TSO-NEXT: addiw a2, a2, -1 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: not a2, a2 +; RV64IA-TSO-NEXT: amoand.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 0 seq_cst ret i16 %1 } @@ -7149,16 +7685,27 @@ define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_minus_1_i16_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a2, 16 -; RV32IA-NEXT: addi a2, a2, -1 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: amoor.w.aq a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a2, 16 +; RV32IA-WMO-NEXT: addi a2, a2, -1 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: amoor.w.aq a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a2, 16 +; RV32IA-TSO-NEXT: addi a2, a2, -1 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_minus_1_i16_acquire: ; RV64I: # %bb.0: @@ -7169,19 +7716,30 @@ define i16 @atomicrmw_xchg_minus_1_i16_acquire(ptr %a) nounwind { ; RV64I-NEXT: li a2, 2 ; RV64I-NEXT: call __atomic_exchange_2@plt ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 -; RV64I-NEXT: ret -; -; RV64IA-LABEL: atomicrmw_xchg_minus_1_i16_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a2, 16 -; RV64IA-NEXT: addiw a2, a2, -1 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: amoor.w.aq a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a2, 16 +; RV64IA-WMO-NEXT: addiw a2, a2, -1 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: amoor.w.aq 
a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a2, 16 +; RV64IA-TSO-NEXT: addiw a2, a2, -1 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 -1 acquire ret i16 %1 } @@ -7199,16 +7757,27 @@ define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_minus_1_i16_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a2, 16 -; RV32IA-NEXT: addi a2, a2, -1 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: amoor.w.rl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a2, 16 +; RV32IA-WMO-NEXT: addi a2, a2, -1 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: amoor.w.rl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a2, 16 +; RV32IA-TSO-NEXT: addi a2, a2, -1 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_minus_1_i16_release: ; RV64I: # %bb.0: @@ -7222,16 +7791,27 @@ define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_minus_1_i16_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a2, 16 -; RV64IA-NEXT: addiw a2, a2, -1 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: amoor.w.rl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a2, 16 +; RV64IA-WMO-NEXT: addiw a2, a2, -1 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: amoor.w.rl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a2, 16 +; RV64IA-TSO-NEXT: addiw a2, a2, -1 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 -1 release ret i16 %1 } @@ -7249,16 +7829,27 @@ define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a2, 16 -; RV32IA-NEXT: addi a2, a2, -1 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: amoor.w.aqrl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; 
RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a2, 16 +; RV32IA-WMO-NEXT: addi a2, a2, -1 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: amoor.w.aqrl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a2, 16 +; RV32IA-TSO-NEXT: addi a2, a2, -1 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel: ; RV64I: # %bb.0: @@ -7272,16 +7863,27 @@ define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a2, 16 -; RV64IA-NEXT: addiw a2, a2, -1 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: amoor.w.aqrl a1, a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a2, 16 +; RV64IA-WMO-NEXT: addiw a2, a2, -1 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: amoor.w.aqrl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a2, 16 +; RV64IA-TSO-NEXT: addiw a2, a2, -1 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 -1 acq_rel ret i16 %1 } @@ -7299,16 +7901,27 @@ define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a1, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a2, 16 -; RV32IA-NEXT: addi a2, a2, -1 -; RV32IA-NEXT: sll a2, a2, a0 -; RV32IA-NEXT: amoor.w.aqrl a1, a2, (a1) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a1, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a2, 16 +; RV32IA-WMO-NEXT: addi a2, a2, -1 +; RV32IA-WMO-NEXT: sll a2, a2, a0 +; RV32IA-WMO-NEXT: amoor.w.aqrl a1, a2, (a1) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a1, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a2, 16 +; RV32IA-TSO-NEXT: addi a2, a2, -1 +; RV32IA-TSO-NEXT: sll a2, a2, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst: ; RV64I: # %bb.0: @@ -7322,16 +7935,27 @@ define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a1, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a2, 16 -; RV64IA-NEXT: addiw a2, a2, -1 -; RV64IA-NEXT: sllw a2, a2, a0 -; RV64IA-NEXT: amoor.w.aqrl a1, 
a2, (a1) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a1, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a2, 16 +; RV64IA-WMO-NEXT: addiw a2, a2, -1 +; RV64IA-WMO-NEXT: sllw a2, a2, a0 +; RV64IA-WMO-NEXT: amoor.w.aqrl a1, a2, (a1) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a1, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a2, 16 +; RV64IA-TSO-NEXT: addiw a2, a2, -1 +; RV64IA-TSO-NEXT: sllw a2, a2, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a2, (a1) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i16 -1 seq_cst ret i16 %1 } @@ -8313,20 +8937,35 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_and_i16_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a3, 16 -; RV32IA-NEXT: addi a3, a3, -1 -; RV32IA-NEXT: sll a4, a3, a0 -; RV32IA-NEXT: not a4, a4 -; RV32IA-NEXT: and a1, a1, a3 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a4, a1 -; RV32IA-NEXT: amoand.w.aq a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_and_i16_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a3, 16 +; RV32IA-WMO-NEXT: addi a3, a3, -1 +; RV32IA-WMO-NEXT: sll a4, a3, a0 +; RV32IA-WMO-NEXT: not a4, a4 +; RV32IA-WMO-NEXT: and a1, a1, a3 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: or a1, a4, a1 +; RV32IA-WMO-NEXT: amoand.w.aq a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_and_i16_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a3, 16 +; RV32IA-TSO-NEXT: addi a3, a3, -1 +; RV32IA-TSO-NEXT: sll a4, a3, a0 +; RV32IA-TSO-NEXT: not a4, a4 +; RV32IA-TSO-NEXT: and a1, a1, a3 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: or a1, a4, a1 +; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_and_i16_acquire: ; RV64I: # %bb.0: @@ -8338,20 +8977,35 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_and_i16_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a3, 16 -; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: sllw a4, a3, a0 -; RV64IA-NEXT: not a4, a4 -; RV64IA-NEXT: and a1, a1, a3 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: or a1, a4, a1 -; RV64IA-NEXT: amoand.w.aq a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_and_i16_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a3, 16 +; RV64IA-WMO-NEXT: addiw a3, a3, -1 +; RV64IA-WMO-NEXT: sllw a4, a3, a0 +; RV64IA-WMO-NEXT: not a4, a4 +; RV64IA-WMO-NEXT: and a1, a1, a3 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: or a1, a4, a1 +; RV64IA-WMO-NEXT: amoand.w.aq a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_and_i16_acquire: +; 
RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a3, 16 +; RV64IA-TSO-NEXT: addiw a3, a3, -1 +; RV64IA-TSO-NEXT: sllw a4, a3, a0 +; RV64IA-TSO-NEXT: not a4, a4 +; RV64IA-TSO-NEXT: and a1, a1, a3 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: or a1, a4, a1 +; RV64IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw and ptr %a, i16 %b acquire ret i16 %1 } @@ -8367,20 +9021,35 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_and_i16_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a3, 16 -; RV32IA-NEXT: addi a3, a3, -1 -; RV32IA-NEXT: sll a4, a3, a0 -; RV32IA-NEXT: not a4, a4 -; RV32IA-NEXT: and a1, a1, a3 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a4, a1 -; RV32IA-NEXT: amoand.w.rl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_and_i16_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a3, 16 +; RV32IA-WMO-NEXT: addi a3, a3, -1 +; RV32IA-WMO-NEXT: sll a4, a3, a0 +; RV32IA-WMO-NEXT: not a4, a4 +; RV32IA-WMO-NEXT: and a1, a1, a3 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: or a1, a4, a1 +; RV32IA-WMO-NEXT: amoand.w.rl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_and_i16_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a3, 16 +; RV32IA-TSO-NEXT: addi a3, a3, -1 +; RV32IA-TSO-NEXT: sll a4, a3, a0 +; RV32IA-TSO-NEXT: not a4, a4 +; RV32IA-TSO-NEXT: and a1, a1, a3 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: or a1, a4, a1 +; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_and_i16_release: ; RV64I: # %bb.0: @@ -8392,20 +9061,35 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_and_i16_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a3, 16 -; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: sllw a4, a3, a0 -; RV64IA-NEXT: not a4, a4 -; RV64IA-NEXT: and a1, a1, a3 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: or a1, a4, a1 -; RV64IA-NEXT: amoand.w.rl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_and_i16_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a3, 16 +; RV64IA-WMO-NEXT: addiw a3, a3, -1 +; RV64IA-WMO-NEXT: sllw a4, a3, a0 +; RV64IA-WMO-NEXT: not a4, a4 +; RV64IA-WMO-NEXT: and a1, a1, a3 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: or a1, a4, a1 +; RV64IA-WMO-NEXT: amoand.w.rl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_and_i16_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a3, 16 +; RV64IA-TSO-NEXT: addiw a3, a3, -1 +; RV64IA-TSO-NEXT: sllw a4, a3, a0 +; RV64IA-TSO-NEXT: not a4, a4 +; RV64IA-TSO-NEXT: and a1, a1, a3 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: or a1, a4, a1 +; RV64IA-TSO-NEXT: 
amoand.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw and ptr %a, i16 %b release ret i16 %1 } @@ -8421,20 +9105,35 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_and_i16_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a3, 16 -; RV32IA-NEXT: addi a3, a3, -1 -; RV32IA-NEXT: sll a4, a3, a0 -; RV32IA-NEXT: not a4, a4 -; RV32IA-NEXT: and a1, a1, a3 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a4, a1 -; RV32IA-NEXT: amoand.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_and_i16_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a3, 16 +; RV32IA-WMO-NEXT: addi a3, a3, -1 +; RV32IA-WMO-NEXT: sll a4, a3, a0 +; RV32IA-WMO-NEXT: not a4, a4 +; RV32IA-WMO-NEXT: and a1, a1, a3 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: or a1, a4, a1 +; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_and_i16_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a3, 16 +; RV32IA-TSO-NEXT: addi a3, a3, -1 +; RV32IA-TSO-NEXT: sll a4, a3, a0 +; RV32IA-TSO-NEXT: not a4, a4 +; RV32IA-TSO-NEXT: and a1, a1, a3 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: or a1, a4, a1 +; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_and_i16_acq_rel: ; RV64I: # %bb.0: @@ -8446,20 +9145,35 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_and_i16_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a3, 16 -; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: sllw a4, a3, a0 -; RV64IA-NEXT: not a4, a4 -; RV64IA-NEXT: and a1, a1, a3 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: or a1, a4, a1 -; RV64IA-NEXT: amoand.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_and_i16_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a3, 16 +; RV64IA-WMO-NEXT: addiw a3, a3, -1 +; RV64IA-WMO-NEXT: sllw a4, a3, a0 +; RV64IA-WMO-NEXT: not a4, a4 +; RV64IA-WMO-NEXT: and a1, a1, a3 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: or a1, a4, a1 +; RV64IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_and_i16_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a3, 16 +; RV64IA-TSO-NEXT: addiw a3, a3, -1 +; RV64IA-TSO-NEXT: sllw a4, a3, a0 +; RV64IA-TSO-NEXT: not a4, a4 +; RV64IA-TSO-NEXT: and a1, a1, a3 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: or a1, a4, a1 +; RV64IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw and ptr %a, i16 %b acq_rel ret i16 %1 } @@ -8475,20 +9189,35 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_and_i16_seq_cst: -; RV32IA: # %bb.0: -; 
RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: lui a3, 16 -; RV32IA-NEXT: addi a3, a3, -1 -; RV32IA-NEXT: sll a4, a3, a0 -; RV32IA-NEXT: not a4, a4 -; RV32IA-NEXT: and a1, a1, a3 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: or a1, a4, a1 -; RV32IA-NEXT: amoand.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_and_i16_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: lui a3, 16 +; RV32IA-WMO-NEXT: addi a3, a3, -1 +; RV32IA-WMO-NEXT: sll a4, a3, a0 +; RV32IA-WMO-NEXT: not a4, a4 +; RV32IA-WMO-NEXT: and a1, a1, a3 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: or a1, a4, a1 +; RV32IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_and_i16_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: lui a3, 16 +; RV32IA-TSO-NEXT: addi a3, a3, -1 +; RV32IA-TSO-NEXT: sll a4, a3, a0 +; RV32IA-TSO-NEXT: not a4, a4 +; RV32IA-TSO-NEXT: and a1, a1, a3 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: or a1, a4, a1 +; RV32IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_and_i16_seq_cst: ; RV64I: # %bb.0: @@ -8500,20 +9229,35 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_and_i16_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: lui a3, 16 -; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: sllw a4, a3, a0 -; RV64IA-NEXT: not a4, a4 -; RV64IA-NEXT: and a1, a1, a3 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: or a1, a4, a1 -; RV64IA-NEXT: amoand.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_and_i16_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: lui a3, 16 +; RV64IA-WMO-NEXT: addiw a3, a3, -1 +; RV64IA-WMO-NEXT: sllw a4, a3, a0 +; RV64IA-WMO-NEXT: not a4, a4 +; RV64IA-WMO-NEXT: and a1, a1, a3 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: or a1, a4, a1 +; RV64IA-WMO-NEXT: amoand.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_and_i16_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: lui a3, 16 +; RV64IA-TSO-NEXT: addiw a3, a3, -1 +; RV64IA-TSO-NEXT: sllw a4, a3, a0 +; RV64IA-TSO-NEXT: not a4, a4 +; RV64IA-TSO-NEXT: and a1, a1, a3 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: or a1, a4, a1 +; RV64IA-TSO-NEXT: amoand.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw and ptr %a, i16 %b seq_cst ret i16 %1 } @@ -9047,16 +9791,27 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_or_i16_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: slli a1, a1, 16 -; RV32IA-NEXT: srli a1, a1, 16 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoor.w.aq a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_or_i16_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, 
-4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: srli a1, a1, 16 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoor.w.aq a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_or_i16_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: srli a1, a1, 16 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_or_i16_acquire: ; RV64I: # %bb.0: @@ -9068,16 +9823,27 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_or_i16_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: slli a1, a1, 48 -; RV64IA-NEXT: srli a1, a1, 48 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoor.w.aq a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_or_i16_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NEXT: srli a1, a1, 48 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoor.w.aq a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_or_i16_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NEXT: srli a1, a1, 48 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw or ptr %a, i16 %b acquire ret i16 %1 } @@ -9093,16 +9859,27 @@ define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_or_i16_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: slli a1, a1, 16 -; RV32IA-NEXT: srli a1, a1, 16 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoor.w.rl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_or_i16_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: srli a1, a1, 16 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoor.w.rl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_or_i16_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: srli a1, a1, 16 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_or_i16_release: ; RV64I: # %bb.0: @@ -9114,16 +9891,27 @@ define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_or_i16_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: slli a1, a1, 48 -; RV64IA-NEXT: srli a1, a1, 48 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoor.w.rl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; 
RV64IA-WMO-LABEL: atomicrmw_or_i16_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NEXT: srli a1, a1, 48 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoor.w.rl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_or_i16_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NEXT: srli a1, a1, 48 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw or ptr %a, i16 %b release ret i16 %1 } @@ -9139,16 +9927,27 @@ define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_or_i16_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: slli a1, a1, 16 -; RV32IA-NEXT: srli a1, a1, 16 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoor.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_or_i16_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: srli a1, a1, 16 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoor.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_or_i16_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: srli a1, a1, 16 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_or_i16_acq_rel: ; RV64I: # %bb.0: @@ -9160,16 +9959,27 @@ define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_or_i16_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: slli a1, a1, 48 -; RV64IA-NEXT: srli a1, a1, 48 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoor.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_or_i16_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NEXT: srli a1, a1, 48 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoor.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_or_i16_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NEXT: srli a1, a1, 48 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw or ptr %a, i16 %b acq_rel ret i16 %1 } @@ -9185,16 +9995,27 @@ define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_or_i16_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: slli a1, a1, 16 -; RV32IA-NEXT: srli a1, a1, 16 -; RV32IA-NEXT: sll a1, a1, a0 
-; RV32IA-NEXT: amoor.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_or_i16_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: srli a1, a1, 16 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoor.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_or_i16_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: srli a1, a1, 16 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_or_i16_seq_cst: ; RV64I: # %bb.0: @@ -9206,16 +10027,27 @@ define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_or_i16_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: slli a1, a1, 48 -; RV64IA-NEXT: srli a1, a1, 48 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoor.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_or_i16_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NEXT: srli a1, a1, 48 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoor.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_or_i16_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NEXT: srli a1, a1, 48 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw or ptr %a, i16 %b seq_cst ret i16 %1 } @@ -9277,16 +10109,27 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xor_i16_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: slli a1, a1, 16 -; RV32IA-NEXT: srli a1, a1, 16 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoxor.w.aq a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xor_i16_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: srli a1, a1, 16 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoxor.w.aq a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xor_i16_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: srli a1, a1, 16 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xor_i16_acquire: ; RV64I: # %bb.0: @@ -9298,16 +10141,27 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xor_i16_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: 
slli a0, a0, 3 -; RV64IA-NEXT: slli a1, a1, 48 -; RV64IA-NEXT: srli a1, a1, 48 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoxor.w.aq a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xor_i16_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NEXT: srli a1, a1, 48 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoxor.w.aq a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xor_i16_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NEXT: srli a1, a1, 48 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xor ptr %a, i16 %b acquire ret i16 %1 } @@ -9323,16 +10177,27 @@ define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xor_i16_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: slli a1, a1, 16 -; RV32IA-NEXT: srli a1, a1, 16 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoxor.w.rl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xor_i16_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: srli a1, a1, 16 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoxor.w.rl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xor_i16_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: srli a1, a1, 16 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xor_i16_release: ; RV64I: # %bb.0: @@ -9344,16 +10209,27 @@ define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xor_i16_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: slli a1, a1, 48 -; RV64IA-NEXT: srli a1, a1, 48 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoxor.w.rl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xor_i16_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NEXT: srli a1, a1, 48 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoxor.w.rl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xor_i16_release: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NEXT: srli a1, a1, 48 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xor ptr %a, i16 %b release ret i16 %1 } @@ -9369,16 +10245,27 @@ define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: 
ret ; -; RV32IA-LABEL: atomicrmw_xor_i16_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: slli a1, a1, 16 -; RV32IA-NEXT: srli a1, a1, 16 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoxor.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xor_i16_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: srli a1, a1, 16 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoxor.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xor_i16_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: srli a1, a1, 16 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xor_i16_acq_rel: ; RV64I: # %bb.0: @@ -9390,16 +10277,27 @@ define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xor_i16_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: slli a1, a1, 48 -; RV64IA-NEXT: srli a1, a1, 48 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoxor.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xor_i16_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NEXT: srli a1, a1, 48 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoxor.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xor_i16_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NEXT: srli a1, a1, 48 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xor ptr %a, i16 %b acq_rel ret i16 %1 } @@ -9415,16 +10313,27 @@ define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xor_i16_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: andi a2, a0, -4 -; RV32IA-NEXT: slli a0, a0, 3 -; RV32IA-NEXT: slli a1, a1, 16 -; RV32IA-NEXT: srli a1, a1, 16 -; RV32IA-NEXT: sll a1, a1, a0 -; RV32IA-NEXT: amoxor.w.aqrl a1, a1, (a2) -; RV32IA-NEXT: srl a0, a1, a0 -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xor_i16_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: andi a2, a0, -4 +; RV32IA-WMO-NEXT: slli a0, a0, 3 +; RV32IA-WMO-NEXT: slli a1, a1, 16 +; RV32IA-WMO-NEXT: srli a1, a1, 16 +; RV32IA-WMO-NEXT: sll a1, a1, a0 +; RV32IA-WMO-NEXT: amoxor.w.aqrl a1, a1, (a2) +; RV32IA-WMO-NEXT: srl a0, a1, a0 +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xor_i16_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: andi a2, a0, -4 +; RV32IA-TSO-NEXT: slli a0, a0, 3 +; RV32IA-TSO-NEXT: slli a1, a1, 16 +; RV32IA-TSO-NEXT: srli a1, a1, 16 +; RV32IA-TSO-NEXT: sll a1, a1, a0 +; RV32IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV32IA-TSO-NEXT: srl a0, a1, a0 +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xor_i16_seq_cst: ; RV64I: # %bb.0: @@ 
-9436,16 +10345,27 @@ define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xor_i16_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: andi a2, a0, -4 -; RV64IA-NEXT: slli a0, a0, 3 -; RV64IA-NEXT: slli a1, a1, 48 -; RV64IA-NEXT: srli a1, a1, 48 -; RV64IA-NEXT: sllw a1, a1, a0 -; RV64IA-NEXT: amoxor.w.aqrl a1, a1, (a2) -; RV64IA-NEXT: srlw a0, a1, a0 -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xor_i16_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: andi a2, a0, -4 +; RV64IA-WMO-NEXT: slli a0, a0, 3 +; RV64IA-WMO-NEXT: slli a1, a1, 48 +; RV64IA-WMO-NEXT: srli a1, a1, 48 +; RV64IA-WMO-NEXT: sllw a1, a1, a0 +; RV64IA-WMO-NEXT: amoxor.w.aqrl a1, a1, (a2) +; RV64IA-WMO-NEXT: srlw a0, a1, a0 +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xor_i16_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: andi a2, a0, -4 +; RV64IA-TSO-NEXT: slli a0, a0, 3 +; RV64IA-TSO-NEXT: slli a1, a1, 48 +; RV64IA-TSO-NEXT: srli a1, a1, 48 +; RV64IA-TSO-NEXT: sllw a1, a1, a0 +; RV64IA-TSO-NEXT: amoxor.w a1, a1, (a2) +; RV64IA-TSO-NEXT: srlw a0, a1, a0 +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xor ptr %a, i16 %b seq_cst ret i16 %1 } @@ -13087,10 +14007,15 @@ define i32 @atomicrmw_xchg_i32_acquire(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_i32_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: amoswap.w.aq a0, a1, (a0) -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_i32_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: amoswap.w.aq a0, a1, (a0) +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_i32_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: amoswap.w a0, a1, (a0) +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_i32_acquire: ; RV64I: # %bb.0: @@ -13102,10 +14027,15 @@ define i32 @atomicrmw_xchg_i32_acquire(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_i32_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: amoswap.w.aq a0, a1, (a0) -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_i32_acquire: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: amoswap.w.aq a0, a1, (a0) +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_i32_acquire: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: amoswap.w a0, a1, (a0) +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i32 %b acquire ret i32 %1 } @@ -13121,10 +14051,15 @@ define i32 @atomicrmw_xchg_i32_release(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_i32_release: -; RV32IA: # %bb.0: -; RV32IA-NEXT: amoswap.w.rl a0, a1, (a0) -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_i32_release: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: amoswap.w.rl a0, a1, (a0) +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_i32_release: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: amoswap.w a0, a1, (a0) +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_i32_release: ; RV64I: # %bb.0: @@ -13136,10 +14071,15 @@ define i32 @atomicrmw_xchg_i32_release(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_i32_release: -; RV64IA: # %bb.0: -; RV64IA-NEXT: amoswap.w.rl a0, a1, (a0) -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_i32_release: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: amoswap.w.rl a0, a1, (a0) +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_i32_release: +; RV64IA-TSO: # 
%bb.0: +; RV64IA-TSO-NEXT: amoswap.w a0, a1, (a0) +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i32 %b release ret i32 %1 } @@ -13155,10 +14095,15 @@ define i32 @atomicrmw_xchg_i32_acq_rel(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_i32_acq_rel: -; RV32IA: # %bb.0: -; RV32IA-NEXT: amoswap.w.aqrl a0, a1, (a0) -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_i32_acq_rel: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: amoswap.w.aqrl a0, a1, (a0) +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_i32_acq_rel: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: amoswap.w a0, a1, (a0) +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_i32_acq_rel: ; RV64I: # %bb.0: @@ -13170,10 +14115,15 @@ define i32 @atomicrmw_xchg_i32_acq_rel(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_i32_acq_rel: -; RV64IA: # %bb.0: -; RV64IA-NEXT: amoswap.w.aqrl a0, a1, (a0) -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_i32_acq_rel: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: amoswap.w.aqrl a0, a1, (a0) +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_i32_acq_rel: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: amoswap.w a0, a1, (a0) +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i32 %b acq_rel ret i32 %1 } @@ -13189,10 +14139,15 @@ define i32 @atomicrmw_xchg_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_xchg_i32_seq_cst: -; RV32IA: # %bb.0: -; RV32IA-NEXT: amoswap.w.aqrl a0, a1, (a0) -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_xchg_i32_seq_cst: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: amoswap.w.aqrl a0, a1, (a0) +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_xchg_i32_seq_cst: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: amoswap.w a0, a1, (a0) +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_xchg_i32_seq_cst: ; RV64I: # %bb.0: @@ -13204,10 +14159,15 @@ define i32 @atomicrmw_xchg_i32_seq_cst(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_xchg_i32_seq_cst: -; RV64IA: # %bb.0: -; RV64IA-NEXT: amoswap.w.aqrl a0, a1, (a0) -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_xchg_i32_seq_cst: +; RV64IA-WMO: # %bb.0: +; RV64IA-WMO-NEXT: amoswap.w.aqrl a0, a1, (a0) +; RV64IA-WMO-NEXT: ret +; +; RV64IA-TSO-LABEL: atomicrmw_xchg_i32_seq_cst: +; RV64IA-TSO: # %bb.0: +; RV64IA-TSO-NEXT: amoswap.w a0, a1, (a0) +; RV64IA-TSO-NEXT: ret %1 = atomicrmw xchg ptr %a, i32 %b seq_cst ret i32 %1 } @@ -13257,10 +14217,15 @@ define i32 @atomicrmw_add_i32_acquire(ptr %a, i32 %b) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV32IA-LABEL: atomicrmw_add_i32_acquire: -; RV32IA: # %bb.0: -; RV32IA-NEXT: amoadd.w.aq a0, a1, (a0) -; RV32IA-NEXT: ret +; RV32IA-WMO-LABEL: atomicrmw_add_i32_acquire: +; RV32IA-WMO: # %bb.0: +; RV32IA-WMO-NEXT: amoadd.w.aq a0, a1, (a0) +; RV32IA-WMO-NEXT: ret +; +; RV32IA-TSO-LABEL: atomicrmw_add_i32_acquire: +; RV32IA-TSO: # %bb.0: +; RV32IA-TSO-NEXT: amoadd.w a0, a1, (a0) +; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomicrmw_add_i32_acquire: ; RV64I: # %bb.0: @@ -13272,10 +14237,15 @@ define i32 @atomicrmw_add_i32_acquire(ptr %a, i32 %b) nounwind { ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; -; RV64IA-LABEL: atomicrmw_add_i32_acquire: -; RV64IA: # %bb.0: -; RV64IA-NEXT: amoadd.w.aq a0, a1, (a0) -; RV64IA-NEXT: ret +; RV64IA-WMO-LABEL: atomicrmw_add_i32_acquire: +; RV64IA-WMO: # %bb.0: +; 
RV64IA-WMO-NEXT: amoadd.w.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_add_i32_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw add ptr %a, i32 %b acquire
 ret i32 %1
 }
@@ -13291,10 +14261,15 @@ define i32 @atomicrmw_add_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_add_i32_release:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoadd.w.rl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_add_i32_release:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoadd.w.rl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_add_i32_release:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_add_i32_release:
 ; RV64I: # %bb.0:
@@ -13306,10 +14281,15 @@ define i32 @atomicrmw_add_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_add_i32_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoadd.w.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_add_i32_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoadd.w.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_add_i32_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw add ptr %a, i32 %b release
 ret i32 %1
 }
@@ -13325,10 +14305,15 @@ define i32 @atomicrmw_add_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_add_i32_acq_rel:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoadd.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_add_i32_acq_rel:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoadd.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_add_i32_acq_rel:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_add_i32_acq_rel:
 ; RV64I: # %bb.0:
@@ -13340,10 +14325,15 @@ define i32 @atomicrmw_add_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_add_i32_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoadd.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_add_i32_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoadd.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_add_i32_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw add ptr %a, i32 %b acq_rel
 ret i32 %1
 }
@@ -13359,10 +14349,15 @@ define i32 @atomicrmw_add_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_add_i32_seq_cst:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoadd.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_add_i32_seq_cst:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoadd.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_add_i32_seq_cst:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_add_i32_seq_cst:
 ; RV64I: # %bb.0:
@@ -13374,10 +14369,15 @@ define i32 @atomicrmw_add_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_add_i32_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoadd.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_add_i32_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoadd.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_add_i32_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw add ptr %a, i32 %b seq_cst
 ret i32 %1
 }
@@ -13429,11 +14429,17 @@ define i32 @atomicrmw_sub_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_sub_i32_acquire:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: neg a1, a1
-; RV32IA-NEXT: amoadd.w.aq a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_sub_i32_acquire:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: neg a1, a1
+; RV32IA-WMO-NEXT: amoadd.w.aq a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_sub_i32_acquire:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: neg a1, a1
+; RV32IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_sub_i32_acquire:
 ; RV64I: # %bb.0:
@@ -13445,11 +14451,17 @@ define i32 @atomicrmw_sub_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_sub_i32_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: neg a1, a1
-; RV64IA-NEXT: amoadd.w.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_sub_i32_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: neg a1, a1
+; RV64IA-WMO-NEXT: amoadd.w.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_sub_i32_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: neg a1, a1
+; RV64IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw sub ptr %a, i32 %b acquire
 ret i32 %1
 }
@@ -13465,11 +14477,17 @@ define i32 @atomicrmw_sub_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_sub_i32_release:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: neg a1, a1
-; RV32IA-NEXT: amoadd.w.rl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_sub_i32_release:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: neg a1, a1
+; RV32IA-WMO-NEXT: amoadd.w.rl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_sub_i32_release:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: neg a1, a1
+; RV32IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_sub_i32_release:
 ; RV64I: # %bb.0:
@@ -13481,11 +14499,17 @@ define i32 @atomicrmw_sub_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_sub_i32_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: neg a1, a1
-; RV64IA-NEXT: amoadd.w.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_sub_i32_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: neg a1, a1
+; RV64IA-WMO-NEXT: amoadd.w.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_sub_i32_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: neg a1, a1
+; RV64IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw sub ptr %a, i32 %b release
 ret i32 %1
 }
@@ -13501,11 +14525,17 @@ define i32 @atomicrmw_sub_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_sub_i32_acq_rel:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: neg a1, a1
-; RV32IA-NEXT: amoadd.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_sub_i32_acq_rel:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: neg a1, a1
+; RV32IA-WMO-NEXT: amoadd.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_sub_i32_acq_rel:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: neg a1, a1
+; RV32IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_sub_i32_acq_rel:
 ; RV64I: # %bb.0:
@@ -13517,11 +14547,17 @@ define i32 @atomicrmw_sub_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_sub_i32_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: neg a1, a1
-; RV64IA-NEXT: amoadd.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_sub_i32_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: neg a1, a1
+; RV64IA-WMO-NEXT: amoadd.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_sub_i32_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: neg a1, a1
+; RV64IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw sub ptr %a, i32 %b acq_rel
 ret i32 %1
 }
@@ -13537,11 +14573,17 @@ define i32 @atomicrmw_sub_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_sub_i32_seq_cst:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: neg a1, a1
-; RV32IA-NEXT: amoadd.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_sub_i32_seq_cst:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: neg a1, a1
+; RV32IA-WMO-NEXT: amoadd.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_sub_i32_seq_cst:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: neg a1, a1
+; RV32IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_sub_i32_seq_cst:
 ; RV64I: # %bb.0:
@@ -13553,11 +14595,17 @@ define i32 @atomicrmw_sub_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_sub_i32_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: neg a1, a1
-; RV64IA-NEXT: amoadd.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_sub_i32_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: neg a1, a1
+; RV64IA-WMO-NEXT: amoadd.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_sub_i32_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: neg a1, a1
+; RV64IA-TSO-NEXT: amoadd.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw sub ptr %a, i32 %b seq_cst
 ret i32 %1
 }
@@ -13607,10 +14655,15 @@ define i32 @atomicrmw_and_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_and_i32_acquire:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoand.w.aq a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_and_i32_acquire:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoand.w.aq a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_and_i32_acquire:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoand.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_and_i32_acquire:
 ; RV64I: # %bb.0:
@@ -13622,10 +14675,15 @@ define i32 @atomicrmw_and_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_and_i32_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoand.w.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_and_i32_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoand.w.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_and_i32_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoand.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw and ptr %a, i32 %b acquire
 ret i32 %1
 }
@@ -13641,10 +14699,15 @@ define i32 @atomicrmw_and_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_and_i32_release:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoand.w.rl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_and_i32_release:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoand.w.rl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_and_i32_release:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoand.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_and_i32_release:
 ; RV64I: # %bb.0:
@@ -13656,10 +14719,15 @@ define i32 @atomicrmw_and_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_and_i32_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoand.w.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_and_i32_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoand.w.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_and_i32_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoand.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw and ptr %a, i32 %b release
 ret i32 %1
 }
@@ -13675,10 +14743,15 @@ define i32 @atomicrmw_and_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_and_i32_acq_rel:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoand.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_and_i32_acq_rel:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoand.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_and_i32_acq_rel:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoand.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_and_i32_acq_rel:
 ; RV64I: # %bb.0:
@@ -13690,10 +14763,15 @@ define i32 @atomicrmw_and_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_and_i32_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoand.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_and_i32_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoand.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_and_i32_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoand.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw and ptr %a, i32 %b acq_rel
 ret i32 %1
 }
@@ -13709,10 +14787,15 @@ define i32 @atomicrmw_and_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_and_i32_seq_cst:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoand.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_and_i32_seq_cst:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoand.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_and_i32_seq_cst:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoand.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_and_i32_seq_cst:
 ; RV64I: # %bb.0:
@@ -13724,10 +14807,15 @@ define i32 @atomicrmw_and_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_and_i32_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoand.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_and_i32_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoand.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_and_i32_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoand.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw and ptr %a, i32 %b seq_cst
 ret i32 %1
 }
@@ -14089,10 +15177,15 @@ define i32 @atomicrmw_or_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_or_i32_acquire:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoor.w.aq a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_or_i32_acquire:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoor.w.aq a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_or_i32_acquire:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoor.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_or_i32_acquire:
 ; RV64I: # %bb.0:
@@ -14104,10 +15197,15 @@ define i32 @atomicrmw_or_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_or_i32_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoor.w.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_or_i32_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoor.w.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_or_i32_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoor.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw or ptr %a, i32 %b acquire
 ret i32 %1
 }
@@ -14123,10 +15221,15 @@ define i32 @atomicrmw_or_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_or_i32_release:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoor.w.rl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_or_i32_release:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoor.w.rl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_or_i32_release:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoor.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_or_i32_release:
 ; RV64I: # %bb.0:
@@ -14138,10 +15241,15 @@ define i32 @atomicrmw_or_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_or_i32_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoor.w.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_or_i32_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoor.w.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_or_i32_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoor.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw or ptr %a, i32 %b release
 ret i32 %1
 }
@@ -14157,10 +15265,15 @@ define i32 @atomicrmw_or_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_or_i32_acq_rel:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoor.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_or_i32_acq_rel:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoor.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_or_i32_acq_rel:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoor.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_or_i32_acq_rel:
 ; RV64I: # %bb.0:
@@ -14172,10 +15285,15 @@ define i32 @atomicrmw_or_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_or_i32_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoor.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_or_i32_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoor.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_or_i32_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoor.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw or ptr %a, i32 %b acq_rel
 ret i32 %1
 }
@@ -14191,10 +15309,15 @@ define i32 @atomicrmw_or_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_or_i32_seq_cst:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoor.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_or_i32_seq_cst:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoor.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_or_i32_seq_cst:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoor.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_or_i32_seq_cst:
 ; RV64I: # %bb.0:
@@ -14206,10 +15329,15 @@ define i32 @atomicrmw_or_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_or_i32_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoor.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_or_i32_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoor.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_or_i32_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoor.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw or ptr %a, i32 %b seq_cst
 ret i32 %1
 }
@@ -14259,10 +15387,15 @@ define i32 @atomicrmw_xor_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_xor_i32_acquire:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoxor.w.aq a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_xor_i32_acquire:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoxor.w.aq a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_xor_i32_acquire:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoxor.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_xor_i32_acquire:
 ; RV64I: # %bb.0:
@@ -14274,10 +15407,15 @@ define i32 @atomicrmw_xor_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xor_i32_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoxor.w.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xor_i32_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoxor.w.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xor_i32_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoxor.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xor ptr %a, i32 %b acquire
 ret i32 %1
 }
@@ -14293,10 +15431,15 @@ define i32 @atomicrmw_xor_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_xor_i32_release:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoxor.w.rl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_xor_i32_release:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoxor.w.rl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_xor_i32_release:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoxor.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_xor_i32_release:
 ; RV64I: # %bb.0:
@@ -14308,10 +15451,15 @@ define i32 @atomicrmw_xor_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xor_i32_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoxor.w.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xor_i32_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoxor.w.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xor_i32_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoxor.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xor ptr %a, i32 %b release
 ret i32 %1
 }
@@ -14327,10 +15475,15 @@ define i32 @atomicrmw_xor_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_xor_i32_acq_rel:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoxor.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_xor_i32_acq_rel:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoxor.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_xor_i32_acq_rel:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoxor.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_xor_i32_acq_rel:
 ; RV64I: # %bb.0:
@@ -14342,10 +15495,15 @@ define i32 @atomicrmw_xor_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xor_i32_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoxor.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xor_i32_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoxor.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xor_i32_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoxor.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xor ptr %a, i32 %b acq_rel
 ret i32 %1
 }
@@ -14361,10 +15519,15 @@ define i32 @atomicrmw_xor_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_xor_i32_seq_cst:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amoxor.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_xor_i32_seq_cst:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amoxor.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_xor_i32_seq_cst:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amoxor.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_xor_i32_seq_cst:
 ; RV64I: # %bb.0:
@@ -14376,10 +15539,15 @@ define i32 @atomicrmw_xor_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xor_i32_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoxor.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xor_i32_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoxor.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xor_i32_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoxor.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xor ptr %a, i32 %b seq_cst
 ret i32 %1
 }
@@ -14510,10 +15678,15 @@ define i32 @atomicrmw_max_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_max_i32_acquire:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomax.w.aq a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_max_i32_acquire:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomax.w.aq a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_max_i32_acquire:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomax.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i32_acquire:
 ; RV64I: # %bb.0:
@@ -14554,10 +15727,15 @@ define i32 @atomicrmw_max_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_max_i32_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomax.w.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_max_i32_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomax.w.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_max_i32_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomax.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw max ptr %a, i32 %b acquire
 ret i32 %1
 }
@@ -14599,10 +15777,15 @@ define i32 @atomicrmw_max_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_max_i32_release:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomax.w.rl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_max_i32_release:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomax.w.rl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_max_i32_release:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomax.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i32_release:
 ; RV64I: # %bb.0:
@@ -14643,10 +15826,15 @@ define i32 @atomicrmw_max_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_max_i32_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomax.w.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_max_i32_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomax.w.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_max_i32_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomax.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw max ptr %a, i32 %b release
 ret i32 %1
 }
@@ -14688,10 +15876,15 @@ define i32 @atomicrmw_max_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_max_i32_acq_rel:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomax.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_max_i32_acq_rel:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomax.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_max_i32_acq_rel:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomax.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i32_acq_rel:
 ; RV64I: # %bb.0:
@@ -14732,10 +15925,15 @@ define i32 @atomicrmw_max_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_max_i32_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomax.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_max_i32_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomax.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_max_i32_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomax.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw max ptr %a, i32 %b acq_rel
 ret i32 %1
 }
@@ -14777,10 +15975,15 @@ define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_max_i32_seq_cst:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomax.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_max_i32_seq_cst:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomax.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_max_i32_seq_cst:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomax.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i32_seq_cst:
 ; RV64I: # %bb.0:
@@ -14821,10 +16024,15 @@ define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_max_i32_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomax.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_max_i32_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomax.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_max_i32_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomax.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw max ptr %a, i32 %b seq_cst
 ret i32 %1
 }
@@ -14955,10 +16163,15 @@ define i32 @atomicrmw_min_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_min_i32_acquire:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomin.w.aq a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_min_i32_acquire:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomin.w.aq a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_min_i32_acquire:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomin.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i32_acquire:
 ; RV64I: # %bb.0:
@@ -14999,10 +16212,15 @@ define i32 @atomicrmw_min_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_min_i32_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomin.w.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_min_i32_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomin.w.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_min_i32_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomin.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw min ptr %a, i32 %b acquire
 ret i32 %1
 }
@@ -15044,10 +16262,15 @@ define i32 @atomicrmw_min_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_min_i32_release:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomin.w.rl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_min_i32_release:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomin.w.rl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_min_i32_release:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomin.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i32_release:
 ; RV64I: # %bb.0:
@@ -15088,10 +16311,15 @@ define i32 @atomicrmw_min_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_min_i32_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomin.w.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_min_i32_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomin.w.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_min_i32_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomin.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw min ptr %a, i32 %b release
 ret i32 %1
 }
@@ -15133,10 +16361,15 @@ define i32 @atomicrmw_min_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_min_i32_acq_rel:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomin.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_min_i32_acq_rel:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomin.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_min_i32_acq_rel:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomin.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i32_acq_rel:
 ; RV64I: # %bb.0:
@@ -15177,10 +16410,15 @@ define i32 @atomicrmw_min_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_min_i32_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomin.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_min_i32_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomin.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_min_i32_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomin.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw min ptr %a, i32 %b acq_rel
 ret i32 %1
 }
@@ -15222,10 +16460,15 @@ define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_min_i32_seq_cst:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomin.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_min_i32_seq_cst:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomin.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_min_i32_seq_cst:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomin.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i32_seq_cst:
 ; RV64I: # %bb.0:
@@ -15266,10 +16509,15 @@ define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_min_i32_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomin.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_min_i32_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomin.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_min_i32_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomin.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw min ptr %a, i32 %b seq_cst
 ret i32 %1
 }
@@ -15400,10 +16648,15 @@ define i32 @atomicrmw_umax_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_umax_i32_acquire:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomaxu.w.aq a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_umax_i32_acquire:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomaxu.w.aq a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_umax_i32_acquire:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomaxu.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_umax_i32_acquire:
 ; RV64I: # %bb.0:
@@ -15444,10 +16697,15 @@ define i32 @atomicrmw_umax_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umax_i32_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomaxu.w.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umax_i32_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomaxu.w.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umax_i32_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomaxu.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umax ptr %a, i32 %b acquire
 ret i32 %1
 }
@@ -15489,10 +16747,15 @@ define i32 @atomicrmw_umax_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_umax_i32_release:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomaxu.w.rl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_umax_i32_release:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomaxu.w.rl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_umax_i32_release:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomaxu.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_umax_i32_release:
 ; RV64I: # %bb.0:
@@ -15533,10 +16796,15 @@ define i32 @atomicrmw_umax_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umax_i32_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomaxu.w.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umax_i32_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomaxu.w.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umax_i32_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomaxu.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umax ptr %a, i32 %b release
 ret i32 %1
 }
@@ -15578,10 +16846,15 @@ define i32 @atomicrmw_umax_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_umax_i32_acq_rel:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomaxu.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_umax_i32_acq_rel:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomaxu.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_umax_i32_acq_rel:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomaxu.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_umax_i32_acq_rel:
 ; RV64I: # %bb.0:
@@ -15622,10 +16895,15 @@ define i32 @atomicrmw_umax_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umax_i32_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomaxu.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umax_i32_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomaxu.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umax_i32_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomaxu.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umax ptr %a, i32 %b acq_rel
 ret i32 %1
 }
@@ -15667,10 +16945,15 @@ define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_umax_i32_seq_cst:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amomaxu.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amomaxu.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amomaxu.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_umax_i32_seq_cst:
 ; RV64I: # %bb.0:
@@ -15711,10 +16994,15 @@ define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umax_i32_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomaxu.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomaxu.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomaxu.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umax ptr %a, i32 %b seq_cst
 ret i32 %1
 }
@@ -15845,10 +17133,15 @@ define i32 @atomicrmw_umin_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_umin_i32_acquire:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amominu.w.aq a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_umin_i32_acquire:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amominu.w.aq a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_umin_i32_acquire:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amominu.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_umin_i32_acquire:
 ; RV64I: # %bb.0:
@@ -15889,10 +17182,15 @@ define i32 @atomicrmw_umin_i32_acquire(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umin_i32_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amominu.w.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umin_i32_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amominu.w.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umin_i32_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amominu.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umin ptr %a, i32 %b acquire
 ret i32 %1
 }
@@ -15934,10 +17232,15 @@ define i32 @atomicrmw_umin_i32_release(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_umin_i32_release:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amominu.w.rl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_umin_i32_release:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amominu.w.rl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_umin_i32_release:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amominu.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_umin_i32_release:
 ; RV64I: # %bb.0:
@@ -15978,10 +17281,15 @@ define i32 @atomicrmw_umin_i32_release(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umin_i32_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amominu.w.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umin_i32_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amominu.w.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umin_i32_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amominu.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umin ptr %a, i32 %b release
 ret i32 %1
 }
@@ -16023,10 +17331,15 @@ define i32 @atomicrmw_umin_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_umin_i32_acq_rel:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amominu.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_umin_i32_acq_rel:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amominu.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_umin_i32_acq_rel:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amominu.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_umin_i32_acq_rel:
 ; RV64I: # %bb.0:
@@ -16067,10 +17380,15 @@ define i32 @atomicrmw_umin_i32_acq_rel(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umin_i32_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amominu.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umin_i32_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amominu.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umin_i32_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amominu.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umin ptr %a, i32 %b acq_rel
 ret i32 %1
 }
@@ -16112,10 +17430,15 @@ define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV32I-NEXT: addi sp, sp, 16
 ; RV32I-NEXT: ret
 ;
-; RV32IA-LABEL: atomicrmw_umin_i32_seq_cst:
-; RV32IA: # %bb.0:
-; RV32IA-NEXT: amominu.w.aqrl a0, a1, (a0)
-; RV32IA-NEXT: ret
+; RV32IA-WMO-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV32IA-WMO: # %bb.0:
+; RV32IA-WMO-NEXT: amominu.w.aqrl a0, a1, (a0)
+; RV32IA-WMO-NEXT: ret
+;
+; RV32IA-TSO-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV32IA-TSO: # %bb.0:
+; RV32IA-TSO-NEXT: amominu.w a0, a1, (a0)
+; RV32IA-TSO-NEXT: ret
 ;
 ; RV64I-LABEL: atomicrmw_umin_i32_seq_cst:
 ; RV64I: # %bb.0:
@@ -16156,10 +17479,15 @@ define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 48
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umin_i32_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amominu.w.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amominu.w.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amominu.w a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umin ptr %a, i32 %b seq_cst
 ret i32 %1
 }
@@ -16234,10 +17562,15 @@ define i64 @atomicrmw_xchg_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xchg_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoswap.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xchg_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoswap.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xchg_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoswap.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xchg ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -16273,10 +17606,15 @@ define i64 @atomicrmw_xchg_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xchg_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoswap.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xchg_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoswap.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xchg_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoswap.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xchg ptr %a, i64 %b release
 ret i64 %1
 }
@@ -16312,10 +17650,15 @@ define i64 @atomicrmw_xchg_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xchg_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoswap.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xchg_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoswap.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xchg_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoswap.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xchg ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -16351,10 +17694,15 @@ define i64 @atomicrmw_xchg_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xchg_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoswap.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xchg_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoswap.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xchg_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoswap.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xchg ptr %a, i64 %b seq_cst
 ret i64 %1
 }
@@ -16429,10 +17777,15 @@ define i64 @atomicrmw_add_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_add_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoadd.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_add_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoadd.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_add_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoadd.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw add ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -16468,10 +17821,15 @@ define i64 @atomicrmw_add_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_add_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoadd.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_add_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoadd.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_add_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoadd.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw add ptr %a, i64 %b release
 ret i64 %1
 }
@@ -16507,10 +17865,15 @@ define i64 @atomicrmw_add_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_add_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoadd.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_add_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoadd.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_add_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoadd.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw add ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -16546,10 +17909,15 @@ define i64 @atomicrmw_add_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_add_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoadd.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_add_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoadd.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_add_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoadd.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw add ptr %a, i64 %b seq_cst
 ret i64 %1
 }
@@ -16625,11 +17993,17 @@ define i64 @atomicrmw_sub_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_sub_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: neg a1, a1
-; RV64IA-NEXT: amoadd.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_sub_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: neg a1, a1
+; RV64IA-WMO-NEXT: amoadd.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_sub_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: neg a1, a1
+; RV64IA-TSO-NEXT: amoadd.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw sub ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -16665,11 +18039,17 @@ define i64 @atomicrmw_sub_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_sub_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: neg a1, a1
-; RV64IA-NEXT: amoadd.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_sub_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: neg a1, a1
+; RV64IA-WMO-NEXT: amoadd.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_sub_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: neg a1, a1
+; RV64IA-TSO-NEXT: amoadd.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw sub ptr %a, i64 %b release
 ret i64 %1
 }
@@ -16705,11 +18085,17 @@ define i64 @atomicrmw_sub_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_sub_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: neg a1, a1
-; RV64IA-NEXT: amoadd.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_sub_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: neg a1, a1
+; RV64IA-WMO-NEXT: amoadd.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_sub_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: neg a1, a1
+; RV64IA-TSO-NEXT: amoadd.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw sub ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -16745,11 +18131,17 @@ define i64 @atomicrmw_sub_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_sub_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: neg a1, a1
-; RV64IA-NEXT: amoadd.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_sub_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: neg a1, a1
+; RV64IA-WMO-NEXT: amoadd.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_sub_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: neg a1, a1
+; RV64IA-TSO-NEXT: amoadd.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw sub ptr %a, i64 %b seq_cst
 ret i64 %1
 }
@@ -16824,10 +18216,15 @@ define i64 @atomicrmw_and_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_and_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoand.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_and_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoand.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_and_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoand.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw and ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -16863,10 +18260,15 @@ define i64 @atomicrmw_and_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_and_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoand.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_and_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoand.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_and_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoand.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw and ptr %a, i64 %b release
 ret i64 %1
 }
@@ -16902,10 +18304,15 @@ define i64 @atomicrmw_and_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_and_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoand.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_and_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoand.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_and_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoand.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw and ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -16941,10 +18348,15 @@ define i64 @atomicrmw_and_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_and_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoand.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_and_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoand.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_and_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoand.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw and ptr %a, i64 %b seq_cst
 ret i64 %1
 }
@@ -17285,10 +18697,15 @@ define i64 @atomicrmw_or_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_or_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoor.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_or_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoor.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_or_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoor.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw or ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -17324,10 +18741,15 @@ define i64 @atomicrmw_or_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_or_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoor.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_or_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoor.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_or_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoor.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw or ptr %a, i64 %b release
 ret i64 %1
 }
@@ -17363,10 +18785,15 @@ define i64 @atomicrmw_or_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_or_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoor.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_or_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoor.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_or_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoor.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw or ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -17402,10 +18829,15 @@ define i64 @atomicrmw_or_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_or_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoor.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_or_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoor.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_or_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoor.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw or ptr %a, i64 %b seq_cst
 ret i64 %1
 }
@@ -17480,10 +18912,15 @@ define i64 @atomicrmw_xor_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xor_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoxor.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xor_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoxor.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xor_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoxor.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xor ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -17519,10 +18956,15 @@ define i64 @atomicrmw_xor_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xor_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoxor.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xor_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoxor.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xor_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoxor.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xor ptr %a, i64 %b release
 ret i64 %1
 }
@@ -17558,10 +19000,15 @@ define i64 @atomicrmw_xor_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xor_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoxor.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xor_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoxor.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xor_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoxor.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xor ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -17597,10 +19044,15 @@ define i64 @atomicrmw_xor_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 16
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_xor_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amoxor.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_xor_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amoxor.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_xor_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amoxor.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw xor ptr %a, i64 %b seq_cst
 ret i64 %1
 }
@@ -17903,10 +19355,15 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_max_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomax.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_max_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomax.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_max_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomax.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw max ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -18056,10 +19513,15 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_max_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomax.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_max_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomax.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_max_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomax.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw max ptr %a, i64 %b release
 ret i64 %1
 }
@@ -18209,10 +19671,15 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_max_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomax.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_max_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomax.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_max_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomax.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw max ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -18362,10 +19829,15 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_max_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomax.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_max_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomax.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_max_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomax.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw max ptr %a, i64 %b seq_cst
 ret i64 %1
 }
@@ -18668,10 +20140,15 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_min_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomin.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_min_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomin.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_min_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomin.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw min ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -18821,10 +20298,15 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_min_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomin.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_min_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomin.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_min_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomin.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw min ptr %a, i64 %b release
 ret i64 %1
 }
@@ -18974,10 +20456,15 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_min_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomin.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_min_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomin.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_min_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomin.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw min ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -19127,10 +20614,15 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_min_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomin.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_min_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomin.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_min_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomin.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw min ptr %a, i64 %b seq_cst
 ret i64 %1
 }
@@ -19433,10 +20925,15 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umax_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomaxu.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umax_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomaxu.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umax_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomaxu.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umax ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -19586,10 +21083,15 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umax_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomaxu.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umax_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomaxu.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umax_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomaxu.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umax ptr %a, i64 %b release
 ret i64 %1
 }
@@ -19739,10 +21241,15 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umax_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomaxu.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umax_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomaxu.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umax_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomaxu.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umax ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -19892,10 +21399,15 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umax_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amomaxu.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amomaxu.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amomaxu.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umax ptr %a, i64 %b seq_cst
 ret i64 %1
 }
@@ -20198,10 +21710,15 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umin_i64_acquire:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amominu.d.aq a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umin_i64_acquire:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amominu.d.aq a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umin_i64_acquire:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amominu.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umin ptr %a, i64 %b acquire
 ret i64 %1
 }
@@ -20351,10 +21868,15 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umin_i64_release:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amominu.d.rl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umin_i64_release:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amominu.d.rl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umin_i64_release:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amominu.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umin ptr %a, i64 %b release
 ret i64 %1
 }
@@ -20504,10 +22026,15 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umin_i64_acq_rel:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amominu.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umin_i64_acq_rel:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amominu.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umin_i64_acq_rel:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amominu.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umin ptr %a, i64 %b acq_rel
 ret i64 %1
 }
@@ -20657,10 +22184,15 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV64I-NEXT: addi sp, sp, 32
 ; RV64I-NEXT: ret
 ;
-; RV64IA-LABEL: atomicrmw_umin_i64_seq_cst:
-; RV64IA: # %bb.0:
-; RV64IA-NEXT: amominu.d.aqrl a0, a1, (a0)
-; RV64IA-NEXT: ret
+; RV64IA-WMO-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV64IA-WMO: # %bb.0:
+; RV64IA-WMO-NEXT: amominu.d.aqrl a0, a1, (a0)
+; RV64IA-WMO-NEXT: ret
+;
+; RV64IA-TSO-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV64IA-TSO: # %bb.0:
+; RV64IA-TSO-NEXT: amominu.d a0, a1, (a0)
+; RV64IA-TSO-NEXT: ret
 %1 = atomicrmw umin ptr %a, i64 %b seq_cst
 ret i64 %1
 }
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index 9ebc0d2c171ba..4d75a74f06ac2 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -129,16 +129,16 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA-NEXT: andi a2, a0, -4
 ; RV64IA-NEXT: slli a0, a0, 3
 ; RV64IA-NEXT: li a3, 255
-; RV64IA-NEXT: sllw a3, a3, a0
-; RV64IA-NEXT: lw a4, 0(a2)
+; RV64IA-NEXT: sllw a4, a3, a0
+; RV64IA-NEXT: lw a3, 0(a2)
 ; RV64IA-NEXT: andi a0, a0, 24
-; RV64IA-NEXT: not a3, a3
+; RV64IA-NEXT: not a4, a4
 ; RV64IA-NEXT: andi a1, a1, 255
 ; RV64IA-NEXT: .LBB0_1: # %atomicrmw.start
 ; RV64IA-NEXT: # =>This Loop Header: Depth=1
 ; RV64IA-NEXT: # Child Loop BB0_3 Depth 2
-; RV64IA-NEXT: srlw a5, a4, a0
-; RV64IA-NEXT: sext.w a6, a4
+; RV64IA-NEXT: srlw a5, a3, a0
+; RV64IA-NEXT: sext.w a6, a3
 ; RV64IA-NEXT: andi a7, a5, 255
 ; RV64IA-NEXT: addiw a5, a5, 1
 ; RV64IA-NEXT: sltu a7, a7, a1
@@ -146,20 +146,20 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA-NEXT: and a5, a7, a5
 ; RV64IA-NEXT: andi a5, a5, 255
 ; RV64IA-NEXT: sllw a5, a5, a0
-; RV64IA-NEXT: and a4, a4, a3
-; RV64IA-NEXT: or a5, a4, a5
+; RV64IA-NEXT: and a3, a3, a4
+; RV64IA-NEXT: or a5, a3, a5
 ; RV64IA-NEXT: .LBB0_3: # %atomicrmw.start
 ; RV64IA-NEXT: # Parent Loop BB0_1 Depth=1
 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT: lr.w.aqrl a4, (a2)
-; RV64IA-NEXT: bne a4, a6, .LBB0_1
+; RV64IA-NEXT: lr.w.aqrl a3, (a2)
+; RV64IA-NEXT: bne a3, a6, .LBB0_1
 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start
 ; RV64IA-NEXT: # in Loop: Header=BB0_3 Depth=2
 ; RV64IA-NEXT: sc.w.rl a7, a5, (a2)
 ; RV64IA-NEXT: bnez a7, .LBB0_3
 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start
 ; RV64IA-NEXT: # %bb.2: # %atomicrmw.end
-; RV64IA-NEXT: srlw a0, a4, a0
+; RV64IA-NEXT: srlw a0, a3, a0
 ; RV64IA-NEXT: ret
 %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
 ret i8 %result
@@ -290,19 +290,19 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA-LABEL: atomicrmw_uinc_wrap_i16:
 ; RV64IA: # %bb.0:
 ; RV64IA-NEXT: andi a2, a0, -4
-; RV64IA-NEXT: slli a4, a0, 3
-; RV64IA-NEXT: andi a0, a4, 24
+; RV64IA-NEXT: slli a5, a0, 3
+; RV64IA-NEXT: andi a0, a5, 24
 ; RV64IA-NEXT: lui a3, 16
 ; RV64IA-NEXT: addiw a3, a3, -1
-; RV64IA-NEXT: lw a5, 0(a2)
-; RV64IA-NEXT: sllw a4, a3, a4
-; RV64IA-NEXT: not a4, a4
+; RV64IA-NEXT: lw a4, 0(a2)
+; RV64IA-NEXT: sllw a5, a3, a5
+; RV64IA-NEXT: not a5, a5
 ; RV64IA-NEXT: and a1, a1, a3
 ; RV64IA-NEXT: .LBB1_1: # %atomicrmw.start
 ; RV64IA-NEXT: # =>This Loop Header: Depth=1
 ; RV64IA-NEXT: # Child Loop BB1_3 Depth 2
-; RV64IA-NEXT: srlw a6, a5, a0
-; RV64IA-NEXT: sext.w a7, a5
+; RV64IA-NEXT: srlw a6, a4, a0
+; RV64IA-NEXT: sext.w a7, a4
 ; RV64IA-NEXT: and t0, a6, a3
 ; RV64IA-NEXT: addiw a6, a6, 1
 ; RV64IA-NEXT: sltu t0, t0, a1
@@ -310,20 +310,20 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA-NEXT: and a6, a6, a3
 ; RV64IA-NEXT: and a6, t0, a6
 ; RV64IA-NEXT: sllw a6, a6, a0
-; RV64IA-NEXT: and a5, a5, a4
-; RV64IA-NEXT: or a6, a5, a6
+; RV64IA-NEXT: and a4, a4, a5
+; RV64IA-NEXT: or a6, a4, a6
 ; RV64IA-NEXT: .LBB1_3: # %atomicrmw.start
 ; RV64IA-NEXT: # Parent Loop BB1_1 Depth=1
 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT: lr.w.aqrl a5, (a2)
-; RV64IA-NEXT: bne a5, a7, .LBB1_1
+; RV64IA-NEXT: lr.w.aqrl a4, (a2)
+; RV64IA-NEXT: bne a4, a7, .LBB1_1
 ; RV64IA-NEXT: # %bb.4: # %atomicrmw.start
 ; RV64IA-NEXT: # in Loop: Header=BB1_3 Depth=2
 ; RV64IA-NEXT: sc.w.rl t0, a6, (a2)
 ; RV64IA-NEXT: bnez t0, .LBB1_3
 ; RV64IA-NEXT: # %bb.5: # %atomicrmw.start
 ; RV64IA-NEXT: # %bb.2: # %atomicrmw.end
-; RV64IA-NEXT: srlw a0, a5, a0
+; RV64IA-NEXT: srlw a0, a4, a0
 ; RV64IA-NEXT: ret
 %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
 ret i16 %result
@@ -776,37 +776,37 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA-LABEL: atomicrmw_udec_wrap_i8:
 ; RV64IA: # %bb.0:
 ; RV64IA-NEXT: andi a2, a0, -4
-; RV64IA-NEXT: slli a3, a0, 3
-; RV64IA-NEXT: andi a0, a3, 24
+; RV64IA-NEXT: slli a4, a0, 3
+; RV64IA-NEXT: andi a0, a4, 24
 ; RV64IA-NEXT: li a5, 255
-; RV64IA-NEXT: lw a4, 0(a2)
-; RV64IA-NEXT: sllw a3, a5, a3
-; RV64IA-NEXT: not a3, a3
+; RV64IA-NEXT: lw a3, 0(a2)
+; RV64IA-NEXT: sllw a4, a5, a4
+; RV64IA-NEXT: not a4, a4
 ; RV64IA-NEXT: andi a5, a1, 255
 ; RV64IA-NEXT: j .LBB4_2
 ; RV64IA-NEXT: .LBB4_1: # %atomicrmw.start
 ; RV64IA-NEXT: # in Loop: Header=BB4_2 Depth=1
-; RV64IA-NEXT: sext.w a6, a4
+; RV64IA-NEXT: sext.w a6, a3
 ; RV64IA-NEXT: andi a7, a7, 255
 ; RV64IA-NEXT: sllw a7, a7, a0
-; RV64IA-NEXT: and a4, a4, a3
-; RV64IA-NEXT: or a7, a4, a7
+; RV64IA-NEXT: and a3, a3, a4
+; RV64IA-NEXT: or a7, a3, a7
 ; RV64IA-NEXT: .LBB4_5: # %atomicrmw.start
 ; RV64IA-NEXT: # Parent Loop BB4_2 Depth=1
 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2
-; RV64IA-NEXT: lr.w.aqrl a4, (a2)
-; RV64IA-NEXT: bne a4, a6, .LBB4_7
+; RV64IA-NEXT: lr.w.aqrl a3, (a2)
+; RV64IA-NEXT: bne a3, a6, .LBB4_7
 ; RV64IA-NEXT: # %bb.6: # %atomicrmw.start
 ; RV64IA-NEXT: # in Loop: Header=BB4_5 Depth=2
 ; RV64IA-NEXT: sc.w.rl t0, a7, (a2)
 ; RV64IA-NEXT: bnez t0, .LBB4_5
 ; RV64IA-NEXT: .LBB4_7: # %atomicrmw.start
 ; RV64IA-NEXT: # in Loop: Header=BB4_2 Depth=1
-; RV64IA-NEXT: beq a4, a6, .LBB4_4
+; RV64IA-NEXT: beq a3, a6, .LBB4_4
 ; RV64IA-NEXT: .LBB4_2: # %atomicrmw.start
 ; RV64IA-NEXT: # =>This Loop Header: Depth=1
 ; RV64IA-NEXT: # Child Loop BB4_5 Depth 2
-; RV64IA-NEXT: srlw a6, a4, a0
+; RV64IA-NEXT: srlw a6, a3, a0
 ; RV64IA-NEXT: andi a7, a6, 255
 ; RV64IA-NEXT: seqz t0, a7
 ; RV64IA-NEXT: sltu a7, a5, a7
@@ -818,7 +818,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA-NEXT: addi a7, a6, -1
 ; RV64IA-NEXT: j .LBB4_1
 ; RV64IA-NEXT: .LBB4_4: # %atomicrmw.end
-; RV64IA-NEXT: srlw a0, a4, a0
+; RV64IA-NEXT: srlw a0, a3, a0
 ; RV64IA-NEXT: ret
 %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
 ret i8 %result
@@ -983,38 +983,38 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA-LABEL: atomicrmw_udec_wrap_i16:
 ; RV64IA: # %bb.0:
 ; RV64IA-NEXT: andi a2, a0, -4
-; RV64IA-NEXT: slli a4, a0, 3
-; RV64IA-NEXT: andi a0, a4, 24
+; RV64IA-NEXT: slli a5, a0, 3
+; RV64IA-NEXT: andi a0, a5, 24
 ; RV64IA-NEXT: lui
a3, 16 ; RV64IA-NEXT: addiw a3, a3, -1 -; RV64IA-NEXT: lw a5, 0(a2) -; RV64IA-NEXT: sllw a4, a3, a4 -; RV64IA-NEXT: not a4, a4 +; RV64IA-NEXT: lw a4, 0(a2) +; RV64IA-NEXT: sllw a5, a3, a5 +; RV64IA-NEXT: not a5, a5 ; RV64IA-NEXT: and a6, a1, a3 ; RV64IA-NEXT: j .LBB5_2 ; RV64IA-NEXT: .LBB5_1: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV64IA-NEXT: sext.w a7, a5 +; RV64IA-NEXT: sext.w a7, a4 ; RV64IA-NEXT: and t0, t0, a3 ; RV64IA-NEXT: sllw t0, t0, a0 -; RV64IA-NEXT: and a5, a5, a4 -; RV64IA-NEXT: or t0, a5, t0 +; RV64IA-NEXT: and a4, a4, a5 +; RV64IA-NEXT: or t0, a4, t0 ; RV64IA-NEXT: .LBB5_5: # %atomicrmw.start ; RV64IA-NEXT: # Parent Loop BB5_2 Depth=1 ; RV64IA-NEXT: # => This Inner Loop Header: Depth=2 -; RV64IA-NEXT: lr.w.aqrl a5, (a2) -; RV64IA-NEXT: bne a5, a7, .LBB5_7 +; RV64IA-NEXT: lr.w.aqrl a4, (a2) +; RV64IA-NEXT: bne a4, a7, .LBB5_7 ; RV64IA-NEXT: # %bb.6: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_5 Depth=2 ; RV64IA-NEXT: sc.w.rl t1, t0, (a2) ; RV64IA-NEXT: bnez t1, .LBB5_5 ; RV64IA-NEXT: .LBB5_7: # %atomicrmw.start ; RV64IA-NEXT: # in Loop: Header=BB5_2 Depth=1 -; RV64IA-NEXT: beq a5, a7, .LBB5_4 +; RV64IA-NEXT: beq a4, a7, .LBB5_4 ; RV64IA-NEXT: .LBB5_2: # %atomicrmw.start ; RV64IA-NEXT: # =>This Loop Header: Depth=1 ; RV64IA-NEXT: # Child Loop BB5_5 Depth 2 -; RV64IA-NEXT: srlw a7, a5, a0 +; RV64IA-NEXT: srlw a7, a4, a0 ; RV64IA-NEXT: and t0, a7, a3 ; RV64IA-NEXT: seqz t1, t0 ; RV64IA-NEXT: sltu t0, a6, t0 @@ -1026,7 +1026,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { ; RV64IA-NEXT: addi t0, a7, -1 ; RV64IA-NEXT: j .LBB5_1 ; RV64IA-NEXT: .LBB5_4: # %atomicrmw.end -; RV64IA-NEXT: srlw a0, a5, a0 +; RV64IA-NEXT: srlw a0, a4, a0 ; RV64IA-NEXT: ret %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst ret i16 %result diff --git a/llvm/test/CodeGen/RISCV/branch-relaxation.ll b/llvm/test/CodeGen/RISCV/branch-relaxation.ll index cbe12187a4110..4f7736e318cae 100644 --- a/llvm/test/CodeGen/RISCV/branch-relaxation.ll +++ b/llvm/test/CodeGen/RISCV/branch-relaxation.ll @@ -1204,28 +1204,28 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t0, 5 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t0, 212(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t1, 216(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, 216(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, 212(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t1, 6 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t1, 204(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t2, 208(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, 208(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, 204(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t2, 7 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t2, 196(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t3, 200(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, 200(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, 196(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s0, 8 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s0, 188(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s1, 192(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s0, 192(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, 188(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s1, 9 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s1, 180(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s2, 184(sp) # 4-byte Folded Spill +; 
CHECK-RV32-NEXT: sw s1, 184(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, 180(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a0, 10 ; CHECK-RV32-NEXT: #NO_APP @@ -1233,83 +1233,83 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a1, 11 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a1, 168(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a2, 172(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a1, 172(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, 168(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a2, 12 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a2, 160(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a3, 164(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, 164(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, 160(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a3, 13 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a3, 152(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a4, 156(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, 156(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, 152(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a4, 14 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a4, 144(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a5, 148(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, 148(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, 144(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a5, 15 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a5, 136(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a6, 140(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, 140(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, 136(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a6, 16 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a6, 128(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a7, 132(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, 132(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, 128(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a7, 17 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw a7, 120(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t0, 124(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, 124(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, 120(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s2, 18 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s2, 112(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s3, 116(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, 116(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, 112(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s3, 19 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s3, 104(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s4, 108(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, 108(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, 104(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s4, 20 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s4, 96(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s5, 100(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, 100(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, 96(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s5, 21 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s5, 88(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s6, 92(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, 92(sp) 
# 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, 88(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s6, 22 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s6, 80(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s7, 84(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, 84(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, 80(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s7, 23 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s7, 72(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s8, 76(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, 76(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, 72(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s8, 24 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s8, 64(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s9, 68(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, 68(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, 64(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s9, 25 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s9, 56(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s10, 60(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, 60(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, 56(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s10, 26 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw s10, 48(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw s11, 52(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, 52(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s11, 48(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s11, 27 ; CHECK-RV32-NEXT: #NO_APP @@ -1317,13 +1317,13 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t3, 28 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t3, 36(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t4, 40(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, 40(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, 36(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t4, 29 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: sw t4, 28(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw t5, 32(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, 32(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t5, 28(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t5, 30 ; CHECK-RV32-NEXT: #NO_APP @@ -1331,13 +1331,12 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t6, 31 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: mv a2, t6 -; CHECK-RV32-NEXT: mv t6, a1 -; CHECK-RV32-NEXT: sw s0, 20(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a1, 24(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; CHECK-RV32-NEXT: xor a1, a1, s0 -; CHECK-RV32-NEXT: sw t5, 24(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: sw a2, 16(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: xor a2, t5, a2 +; CHECK-RV32-NEXT: sw t6, 20(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t5, 16(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: xor a2, t5, t6 ; CHECK-RV32-NEXT: or a1, a2, a1 ; CHECK-RV32-NEXT: beqz a1, .LBB4_1 ; CHECK-RV32-NEXT: # %bb.3: @@ -1350,28 +1349,28 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use ra ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t0, 212(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t1, 216(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, 216(sp) # 4-byte Folded Reload +; 
CHECK-RV32-NEXT: lw t1, 212(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t0 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t1, 204(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t2, 208(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t1, 208(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, 204(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t1 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t2, 196(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t3, 200(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, 200(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, 196(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t2 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s0, 188(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s1, 192(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, 192(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, 188(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s0 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s1, 180(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s2, 184(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, 184(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, 180(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s1 ; CHECK-RV32-NEXT: #NO_APP @@ -1379,83 +1378,83 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a0 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a1, 168(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a2, 172(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a1, 172(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, 168(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a1 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a2, 160(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a3, 164(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, 164(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, 160(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a2 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a3, 152(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a4, 156(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, 156(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, 152(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a3 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a4, 144(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a5, 148(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, 148(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, 144(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a4 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a5, 136(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a6, 140(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, 140(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, 136(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a5 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a6, 128(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw a7, 132(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, 132(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a7, 128(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a6 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw a7, 120(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t0, 124(sp) # 4-byte Folded Reload +; 
CHECK-RV32-NEXT: lw a7, 124(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, 120(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a7 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s2, 112(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s3, 116(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, 116(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, 112(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s2 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s3, 104(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s4, 108(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, 108(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, 104(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s3 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s4, 96(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s5, 100(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, 100(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, 96(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s4 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s5, 88(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s6, 92(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, 92(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, 88(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s5 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s6, 80(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s7, 84(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, 84(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, 80(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s6 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s7, 72(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s8, 76(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, 76(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, 72(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s7 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s8, 64(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s9, 68(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, 68(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, 64(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s8 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s9, 56(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s10, 60(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, 60(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, 56(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s9 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw s10, 48(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s11, 52(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, 52(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s11, 48(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s10 ; CHECK-RV32-NEXT: #NO_APP @@ -1463,22 +1462,23 @@ define void @relax_jal_spill_64() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s11 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t3, 36(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t4, 40(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, 40(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, 36(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t3 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t4, 28(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw t5, 
32(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, 32(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t5, 28(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t4 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t5, 24(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t5, 16(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t6, 24(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t5 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: lw t6, 16(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: lw s0, 20(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t6, 20(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t6 ; CHECK-RV32-NEXT: #NO_APP @@ -1839,46 +1839,46 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t0, -8(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, -4(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t1, -4(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, -8(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t1, 6 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t1, -16(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t1, -12(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t2, -12(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, -16(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t2, 7 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t2, -24(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t2, -20(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw t3, -20(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, -24(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s0, 8 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s0, -32(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s0, -28(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s1, -28(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, -32(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s1, 9 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s1, -40(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, -36(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: sw s2, -36(a0) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, -40(a0) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a0, 10 ; CHECK-RV32-NEXT: #NO_APP @@ -1890,145 +1890,145 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a3, 1 ; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: sw a1, -52(a3) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a1, -48(a3) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a2, -48(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, -52(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; 
CHECK-RV32-NEXT: li a2, 12 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a2, -60(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a2, -56(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a3, -56(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, -60(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a3, 13 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a3, -68(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a3, -64(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a4, -64(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, -68(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a4, 14 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a4, -76(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a4, -72(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a5, -72(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, -76(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a5, 15 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a5, -84(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a5, -80(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a6, -80(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, -84(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a6, 16 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a6, -92(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a6, -88(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a7, -88(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, -92(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li a7, 17 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw a7, -100(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw a7, -96(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t0, -96(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t0, -100(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s2, 18 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s2, -108(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s2, -104(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s3, -104(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, -108(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s3, 19 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s3, -116(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s3, -112(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s4, -112(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, -116(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s4, 20 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: 
lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s4, -124(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s4, -120(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s5, -120(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, -124(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s5, 21 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s5, -132(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s5, -128(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s6, -128(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, -132(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s6, 22 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s6, -140(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s6, -136(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s7, -136(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, -140(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s7, 23 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s7, -148(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s7, -144(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s8, -144(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, -148(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s8, 24 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s8, -156(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s8, -152(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s9, -152(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, -156(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s9, 25 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s9, -164(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s9, -160(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s10, -160(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, -164(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s10, 26 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s10, -172(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s10, -168(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw s11, -168(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s11, -172(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li s11, 27 ; CHECK-RV32-NEXT: #NO_APP @@ -2040,19 +2040,19 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t3, -184(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t3, -180(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t4, -180(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, -184(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; 
CHECK-RV32-NEXT: li t4, 29 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t4, -192(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t4, -188(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: sw t5, -188(a1) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw t5, -192(a1) # 4-byte Folded Spill ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t5, 30 ; CHECK-RV32-NEXT: #NO_APP @@ -2060,19 +2060,20 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: li t6, 31 ; CHECK-RV32-NEXT: #NO_APP -; CHECK-RV32-NEXT: mv a2, t6 -; CHECK-RV32-NEXT: mv t6, a1 -; CHECK-RV32-NEXT: lui a3, 1 -; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: sw s0, -204(a3) # 4-byte Folded Spill +; CHECK-RV32-NEXT: lui a2, 1 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: sw s0, -208(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: lui a2, 1 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: sw a1, -196(a2) # 4-byte Folded Spill ; CHECK-RV32-NEXT: xor a1, a1, s0 -; CHECK-RV32-NEXT: lui a3, 1 -; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: sw t5, -196(a3) # 4-byte Folded Spill -; CHECK-RV32-NEXT: lui a3, 1 -; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: sw a2, -200(a3) # 4-byte Folded Spill -; CHECK-RV32-NEXT: xor a2, t5, a2 +; CHECK-RV32-NEXT: lui a2, 1 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: sw t6, -200(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: lui a2, 1 +; CHECK-RV32-NEXT: add a2, sp, a2 +; CHECK-RV32-NEXT: sw t5, -204(a2) # 4-byte Folded Spill +; CHECK-RV32-NEXT: xor a2, t5, t6 ; CHECK-RV32-NEXT: or a1, a2, a1 ; CHECK-RV32-NEXT: beqz a1, .LBB5_1 ; CHECK-RV32-NEXT: # %bb.3: @@ -2087,46 +2088,46 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t0, -8(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, -4(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t1, -4(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t1, -8(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t0 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t1, -16(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t1, -12(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t2, -12(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, -16(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t1 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t2, -24(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t2, -20(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw t3, -20(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, -24(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t2 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s0, -32(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, -28(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s1, -28(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, -32(a1) # 4-byte Folded Reload 
; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s0 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s1, -40(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, -36(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a1, 1 ; CHECK-RV32-NEXT: add a1, sp, a1 -; CHECK-RV32-NEXT: lw s2, -36(a1) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, -40(a1) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s1 ; CHECK-RV32-NEXT: #NO_APP @@ -2138,145 +2139,145 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a1, -52(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a1, -48(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a2, -48(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, -52(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a1 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a2, -60(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a2, -56(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a3, -56(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, -60(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a2 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a3, -68(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a3, -64(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a4, -64(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, -68(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a3 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a4, -76(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a4, -72(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a5, -72(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, -76(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a4 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a5, -84(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a5, -80(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a6, -80(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, -84(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a5 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a6, -92(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a6, -88(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a7, -88(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a7, -92(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a6 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw a7, -100(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw a7, -96(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t0, -96(a0) 
# 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t0, -100(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use a7 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s2, -108(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s2, -104(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s3, -104(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, -108(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s2 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s3, -116(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s3, -112(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s4, -112(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, -116(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s3 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s4, -124(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s4, -120(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s5, -120(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, -124(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s4 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s5, -132(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s5, -128(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s6, -128(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, -132(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s5 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s6, -140(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s6, -136(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s7, -136(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, -140(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s6 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s7, -148(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s7, -144(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s8, -144(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, -148(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s7 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s8, -156(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s8, -152(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s9, -152(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, -156(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s8 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s9, -164(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s9, -160(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s10, 
-160(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, -164(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s9 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s10, -172(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s10, -168(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s11, -168(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s11, -172(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use s10 ; CHECK-RV32-NEXT: #NO_APP @@ -2288,31 +2289,34 @@ define void @relax_jal_spill_64_adjust_spill_slot() { ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t3, -184(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t3, -180(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t4, -180(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, -184(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t3 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t4, -192(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t4, -188(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t5, -188(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t5, -192(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t4 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw t5, -196(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw t5, -204(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lui a0, 1 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: lw t6, -196(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: #APP ; CHECK-RV32-NEXT: # reg use t5 ; CHECK-RV32-NEXT: #NO_APP ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: lw s0, -204(a0) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s0, -208(a0) # 4-byte Folded Reload ; CHECK-RV32-NEXT: lui a0, 1 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: lw t6, -200(a0) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/branch.ll b/llvm/test/CodeGen/RISCV/branch.ll index 492e47812b80f..578080cd3a240 100644 --- a/llvm/test/CodeGen/RISCV/branch.ll +++ b/llvm/test/CodeGen/RISCV/branch.ll @@ -35,7 +35,7 @@ define void @foo(i32 %a, ptr %b, i1 %c) nounwind { ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: bgeu a0, a3, .LBB0_14 ; RV32I-NEXT: # %bb.10: # %test11 -; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw zero, 0(a1) ; RV32I-NEXT: andi a2, a2, 1 ; RV32I-NEXT: bnez a2, .LBB0_14 ; RV32I-NEXT: # %bb.11: # %test12 @@ -45,7 +45,7 @@ define void @foo(i32 %a, ptr %b, i1 %c) nounwind { ; RV32I-NEXT: lw a0, 0(a1) ; RV32I-NEXT: blez a0, .LBB0_14 ; RV32I-NEXT: # %bb.13: # %test14 -; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: lw zero, 0(a1) ; RV32I-NEXT: .LBB0_14: # %end ; RV32I-NEXT: ret %val1 = load volatile i32, ptr %b diff --git a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll index 30374c13d60fe..ca290988b58ca 100644 --- a/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll +++ b/llvm/test/CodeGen/RISCV/callee-saved-gprs.ll @@ -50,16 +50,16 @@ define void @callee() nounwind { ; RV32I-NEXT: sw s9, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: 
sw s11, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a7, %hi(var) -; RV32I-NEXT: lw a0, %lo(var)(a7) +; RV32I-NEXT: lui a6, %hi(var) +; RV32I-NEXT: lw a0, %lo(var)(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+4)(a7) +; RV32I-NEXT: lw a0, %lo(var+4)(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+8)(a7) +; RV32I-NEXT: lw a0, %lo(var+8)(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var+12)(a7) +; RV32I-NEXT: lw a0, %lo(var+12)(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a7, %lo(var) +; RV32I-NEXT: addi a5, a6, %lo(var) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -84,7 +84,7 @@ define void @callee() nounwind { ; RV32I-NEXT: lw s10, 92(a5) ; RV32I-NEXT: lw s11, 96(a5) ; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a6, 104(a5) +; RV32I-NEXT: lw a7, 104(a5) ; RV32I-NEXT: lw a4, 108(a5) ; RV32I-NEXT: lw a0, 124(a5) ; RV32I-NEXT: lw a1, 120(a5) @@ -95,7 +95,7 @@ define void @callee() nounwind { ; RV32I-NEXT: sw a2, 116(a5) ; RV32I-NEXT: sw a3, 112(a5) ; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a6, 104(a5) +; RV32I-NEXT: sw a7, 104(a5) ; RV32I-NEXT: sw ra, 100(a5) ; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) @@ -121,13 +121,13 @@ define void @callee() nounwind { ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+12)(a7) +; RV32I-NEXT: sw a0, %lo(var+12)(a6) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+8)(a7) +; RV32I-NEXT: sw a0, %lo(var+8)(a6) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var+4)(a7) +; RV32I-NEXT: sw a0, %lo(var+4)(a6) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var)(a7) +; RV32I-NEXT: sw a0, %lo(var)(a6) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -161,16 +161,16 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: sw s11, 28(sp) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: addi s0, sp, 80 -; RV32I-WITH-FP-NEXT: lui a7, %hi(var) -; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(a7) +; RV32I-WITH-FP-NEXT: lui a6, %hi(var) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV32I-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) ; RV32I-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) ; RV32I-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) +; RV32I-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) ; RV32I-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32I-WITH-FP-NEXT: addi a5, a7, %lo(var) +; RV32I-WITH-FP-NEXT: addi a5, a6, %lo(var) ; RV32I-WITH-FP-NEXT: lw a0, 16(a5) ; RV32I-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill ; RV32I-WITH-FP-NEXT: lw a0, 20(a5) @@ -196,7 +196,7 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: lw s11, 92(a5) ; RV32I-WITH-FP-NEXT: lw ra, 96(a5) ; RV32I-WITH-FP-NEXT: lw t0, 100(a5) -; RV32I-WITH-FP-NEXT: lw a6, 104(a5) +; RV32I-WITH-FP-NEXT: lw a7, 104(a5) ; RV32I-WITH-FP-NEXT: lw a4, 108(a5) ; RV32I-WITH-FP-NEXT: lw a0, 124(a5) ; 
RV32I-WITH-FP-NEXT: lw a1, 120(a5) @@ -207,7 +207,7 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: sw a2, 116(a5) ; RV32I-WITH-FP-NEXT: sw a3, 112(a5) ; RV32I-WITH-FP-NEXT: sw a4, 108(a5) -; RV32I-WITH-FP-NEXT: sw a6, 104(a5) +; RV32I-WITH-FP-NEXT: sw a7, 104(a5) ; RV32I-WITH-FP-NEXT: sw t0, 100(a5) ; RV32I-WITH-FP-NEXT: sw ra, 96(a5) ; RV32I-WITH-FP-NEXT: sw s11, 92(a5) @@ -234,13 +234,13 @@ define void @callee() nounwind { ; RV32I-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: sw a0, 16(a5) ; RV32I-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) ; RV32I-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) ; RV32I-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) ; RV32I-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload -; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(a7) +; RV32I-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV32I-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-WITH-FP-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -260,16 +260,16 @@ define void @callee() nounwind { ; RV32IZCMP-LABEL: callee: ; RV32IZCMP: # %bb.0: ; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-NEXT: lui a7, %hi(var) -; RV32IZCMP-NEXT: lw a0, %lo(var)(a7) +; RV32IZCMP-NEXT: lui a6, %hi(var) +; RV32IZCMP-NEXT: lw a0, %lo(var)(a6) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+4)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var+4)(a6) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+8)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var+8)(a6) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var+12)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var+12)(a6) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a7, %lo(var) +; RV32IZCMP-NEXT: addi a5, a6, %lo(var) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -294,7 +294,7 @@ define void @callee() nounwind { ; RV32IZCMP-NEXT: lw t1, 92(a5) ; RV32IZCMP-NEXT: lw t0, 96(a5) ; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a6, 104(a5) +; RV32IZCMP-NEXT: lw a7, 104(a5) ; RV32IZCMP-NEXT: lw a4, 108(a5) ; RV32IZCMP-NEXT: lw a0, 124(a5) ; RV32IZCMP-NEXT: lw a1, 120(a5) @@ -305,7 +305,7 @@ define void @callee() nounwind { ; RV32IZCMP-NEXT: sw a2, 116(a5) ; RV32IZCMP-NEXT: sw a3, 112(a5) ; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a6, 104(a5) +; RV32IZCMP-NEXT: sw a7, 104(a5) ; RV32IZCMP-NEXT: sw s0, 100(a5) ; RV32IZCMP-NEXT: sw t0, 96(a5) ; RV32IZCMP-NEXT: sw t1, 92(a5) @@ -331,13 +331,13 @@ define void @callee() nounwind { ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+12)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var+12)(a6) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+8)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var+8)(a6) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var+4)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var+4)(a6) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, 
%lo(var)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var)(a6) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV32IZCMP-WITH-FP-LABEL: callee: @@ -357,16 +357,16 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: sw s11, 28(sp) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: addi s0, sp, 80 -; RV32IZCMP-WITH-FP-NEXT: lui a7, %hi(var) -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a7) +; RV32IZCMP-WITH-FP-NEXT: lui a6, %hi(var) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -56(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -60(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -64(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) +; RV32IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -68(s0) # 4-byte Folded Spill -; RV32IZCMP-WITH-FP-NEXT: addi a5, a7, %lo(var) +; RV32IZCMP-WITH-FP-NEXT: addi a5, a6, %lo(var) ; RV32IZCMP-WITH-FP-NEXT: lw a0, 16(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a0, -72(s0) # 4-byte Folded Spill ; RV32IZCMP-WITH-FP-NEXT: lw a0, 20(a5) @@ -392,7 +392,7 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: lw s1, 92(a5) ; RV32IZCMP-WITH-FP-NEXT: lw t1, 96(a5) ; RV32IZCMP-WITH-FP-NEXT: lw t0, 100(a5) -; RV32IZCMP-WITH-FP-NEXT: lw a6, 104(a5) +; RV32IZCMP-WITH-FP-NEXT: lw a7, 104(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a4, 108(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a0, 124(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a1, 120(a5) @@ -403,7 +403,7 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: sw a2, 116(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a3, 112(a5) ; RV32IZCMP-WITH-FP-NEXT: sw a4, 108(a5) -; RV32IZCMP-WITH-FP-NEXT: sw a6, 104(a5) +; RV32IZCMP-WITH-FP-NEXT: sw a7, 104(a5) ; RV32IZCMP-WITH-FP-NEXT: sw t0, 100(a5) ; RV32IZCMP-WITH-FP-NEXT: sw t1, 96(a5) ; RV32IZCMP-WITH-FP-NEXT: sw s1, 92(a5) @@ -430,13 +430,13 @@ define void @callee() nounwind { ; RV32IZCMP-WITH-FP-NEXT: lw a0, -72(s0) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: sw a0, 16(a5) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -68(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -64(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -60(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw a0, -56(s0) # 4-byte Folded Reload -; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a7) +; RV32IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV32IZCMP-WITH-FP-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32IZCMP-WITH-FP-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -469,16 +469,16 @@ define void @callee() nounwind { ; RV64I-NEXT: sd s9, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a7, %hi(var) -; RV64I-NEXT: lw a0, %lo(var)(a7) +; RV64I-NEXT: lui a6, %hi(var) +; RV64I-NEXT: lw a0, %lo(var)(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+4)(a7) +; RV64I-NEXT: lw a0, %lo(var+4)(a6) ; 
RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+8)(a7) +; RV64I-NEXT: lw a0, %lo(var+8)(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var+12)(a7) +; RV64I-NEXT: lw a0, %lo(var+12)(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a7, %lo(var) +; RV64I-NEXT: addi a5, a6, %lo(var) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -503,7 +503,7 @@ define void @callee() nounwind { ; RV64I-NEXT: lw s10, 92(a5) ; RV64I-NEXT: lw s11, 96(a5) ; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a6, 104(a5) +; RV64I-NEXT: lw a7, 104(a5) ; RV64I-NEXT: lw a4, 108(a5) ; RV64I-NEXT: lw a0, 124(a5) ; RV64I-NEXT: lw a1, 120(a5) @@ -514,7 +514,7 @@ define void @callee() nounwind { ; RV64I-NEXT: sw a2, 116(a5) ; RV64I-NEXT: sw a3, 112(a5) ; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a6, 104(a5) +; RV64I-NEXT: sw a7, 104(a5) ; RV64I-NEXT: sw ra, 100(a5) ; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) @@ -540,13 +540,13 @@ define void @callee() nounwind { ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+12)(a7) +; RV64I-NEXT: sw a0, %lo(var+12)(a6) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+8)(a7) +; RV64I-NEXT: sw a0, %lo(var+8)(a6) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var+4)(a7) +; RV64I-NEXT: sw a0, %lo(var+4)(a6) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var)(a7) +; RV64I-NEXT: sw a0, %lo(var)(a6) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload @@ -580,16 +580,16 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: sd s11, 56(sp) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: addi s0, sp, 160 -; RV64I-WITH-FP-NEXT: lui a7, %hi(var) -; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(a7) +; RV64I-WITH-FP-NEXT: lui a6, %hi(var) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV64I-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) ; RV64I-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) ; RV64I-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) +; RV64I-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) ; RV64I-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64I-WITH-FP-NEXT: addi a5, a7, %lo(var) +; RV64I-WITH-FP-NEXT: addi a5, a6, %lo(var) ; RV64I-WITH-FP-NEXT: lw a0, 16(a5) ; RV64I-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill ; RV64I-WITH-FP-NEXT: lw a0, 20(a5) @@ -615,7 +615,7 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: lw s11, 92(a5) ; RV64I-WITH-FP-NEXT: lw ra, 96(a5) ; RV64I-WITH-FP-NEXT: lw t0, 100(a5) -; RV64I-WITH-FP-NEXT: lw a6, 104(a5) +; RV64I-WITH-FP-NEXT: lw a7, 104(a5) ; RV64I-WITH-FP-NEXT: lw a4, 108(a5) ; RV64I-WITH-FP-NEXT: lw a0, 124(a5) ; RV64I-WITH-FP-NEXT: lw a1, 120(a5) @@ -626,7 +626,7 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: sw a2, 116(a5) ; RV64I-WITH-FP-NEXT: sw a3, 112(a5) ; RV64I-WITH-FP-NEXT: sw a4, 108(a5) -; RV64I-WITH-FP-NEXT: sw a6, 104(a5) +; RV64I-WITH-FP-NEXT: sw a7, 104(a5) ; 
RV64I-WITH-FP-NEXT: sw t0, 100(a5) ; RV64I-WITH-FP-NEXT: sw ra, 96(a5) ; RV64I-WITH-FP-NEXT: sw s11, 92(a5) @@ -653,13 +653,13 @@ define void @callee() nounwind { ; RV64I-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: sw a0, 16(a5) ; RV64I-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) ; RV64I-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) ; RV64I-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) ; RV64I-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload -; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(a7) +; RV64I-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV64I-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-WITH-FP-NEXT: ld s1, 136(sp) # 8-byte Folded Reload @@ -679,16 +679,16 @@ define void @callee() nounwind { ; RV64IZCMP-LABEL: callee: ; RV64IZCMP: # %bb.0: ; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-NEXT: lui a7, %hi(var) -; RV64IZCMP-NEXT: lw a0, %lo(var)(a7) +; RV64IZCMP-NEXT: lui a6, %hi(var) +; RV64IZCMP-NEXT: lw a0, %lo(var)(a6) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+4)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var+4)(a6) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+8)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var+8)(a6) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var+12)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var+12)(a6) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a7, %lo(var) +; RV64IZCMP-NEXT: addi a5, a6, %lo(var) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -713,7 +713,7 @@ define void @callee() nounwind { ; RV64IZCMP-NEXT: lw t1, 92(a5) ; RV64IZCMP-NEXT: lw t0, 96(a5) ; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a6, 104(a5) +; RV64IZCMP-NEXT: lw a7, 104(a5) ; RV64IZCMP-NEXT: lw a4, 108(a5) ; RV64IZCMP-NEXT: lw a0, 124(a5) ; RV64IZCMP-NEXT: lw a1, 120(a5) @@ -724,7 +724,7 @@ define void @callee() nounwind { ; RV64IZCMP-NEXT: sw a2, 116(a5) ; RV64IZCMP-NEXT: sw a3, 112(a5) ; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a6, 104(a5) +; RV64IZCMP-NEXT: sw a7, 104(a5) ; RV64IZCMP-NEXT: sw s0, 100(a5) ; RV64IZCMP-NEXT: sw t0, 96(a5) ; RV64IZCMP-NEXT: sw t1, 92(a5) @@ -750,13 +750,13 @@ define void @callee() nounwind { ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+12)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var+12)(a6) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+8)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var+8)(a6) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var+4)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var+4)(a6) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var)(a6) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV64IZCMP-WITH-FP-LABEL: callee: @@ -776,16 +776,16 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: sd 
s11, 56(sp) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: addi s0, sp, 160 -; RV64IZCMP-WITH-FP-NEXT: lui a7, %hi(var) -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a7) +; RV64IZCMP-WITH-FP-NEXT: lui a6, %hi(var) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var)(a6) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -112(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a7) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+4)(a6) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -120(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a7) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+8)(a6) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -128(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a7) +; RV64IZCMP-WITH-FP-NEXT: lw a0, %lo(var+12)(a6) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -136(s0) # 8-byte Folded Spill -; RV64IZCMP-WITH-FP-NEXT: addi a5, a7, %lo(var) +; RV64IZCMP-WITH-FP-NEXT: addi a5, a6, %lo(var) ; RV64IZCMP-WITH-FP-NEXT: lw a0, 16(a5) ; RV64IZCMP-WITH-FP-NEXT: sd a0, -144(s0) # 8-byte Folded Spill ; RV64IZCMP-WITH-FP-NEXT: lw a0, 20(a5) @@ -811,7 +811,7 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: lw s1, 92(a5) ; RV64IZCMP-WITH-FP-NEXT: lw t1, 96(a5) ; RV64IZCMP-WITH-FP-NEXT: lw t0, 100(a5) -; RV64IZCMP-WITH-FP-NEXT: lw a6, 104(a5) +; RV64IZCMP-WITH-FP-NEXT: lw a7, 104(a5) ; RV64IZCMP-WITH-FP-NEXT: lw a4, 108(a5) ; RV64IZCMP-WITH-FP-NEXT: lw a0, 124(a5) ; RV64IZCMP-WITH-FP-NEXT: lw a1, 120(a5) @@ -822,7 +822,7 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: sw a2, 116(a5) ; RV64IZCMP-WITH-FP-NEXT: sw a3, 112(a5) ; RV64IZCMP-WITH-FP-NEXT: sw a4, 108(a5) -; RV64IZCMP-WITH-FP-NEXT: sw a6, 104(a5) +; RV64IZCMP-WITH-FP-NEXT: sw a7, 104(a5) ; RV64IZCMP-WITH-FP-NEXT: sw t0, 100(a5) ; RV64IZCMP-WITH-FP-NEXT: sw t1, 96(a5) ; RV64IZCMP-WITH-FP-NEXT: sw s1, 92(a5) @@ -849,13 +849,13 @@ define void @callee() nounwind { ; RV64IZCMP-WITH-FP-NEXT: ld a0, -144(s0) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: sw a0, 16(a5) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -136(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a7) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+12)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -128(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a7) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+8)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -120(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a7) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var+4)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld a0, -112(s0) # 8-byte Folded Reload -; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a7) +; RV64IZCMP-WITH-FP-NEXT: sw a0, %lo(var)(a6) ; RV64IZCMP-WITH-FP-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64IZCMP-WITH-FP-NEXT: ld s1, 136(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/double-mem.ll b/llvm/test/CodeGen/RISCV/double-mem.ll index b3f31730e7119..fb043f44beb39 100644 --- a/llvm/test/CodeGen/RISCV/double-mem.ll +++ b/llvm/test/CodeGen/RISCV/double-mem.ll @@ -136,10 +136,10 @@ define dso_local double @fld_fsd_global(double %a, double %b) nounwind { ; RV64IZFINXZDINX: # %bb.0: ; RV64IZFINXZDINX-NEXT: fadd.d a0, a0, a1 ; RV64IZFINXZDINX-NEXT: lui a1, %hi(G) -; RV64IZFINXZDINX-NEXT: ld a2, %lo(G)(a1) +; RV64IZFINXZDINX-NEXT: ld zero, %lo(G)(a1) ; RV64IZFINXZDINX-NEXT: addi a2, a1, %lo(G) ; RV64IZFINXZDINX-NEXT: sd a0, %lo(G)(a1) -; RV64IZFINXZDINX-NEXT: ld a1, 72(a2) +; RV64IZFINXZDINX-NEXT: ld zero, 72(a2) ; RV64IZFINXZDINX-NEXT: sd a0, 72(a2) ; 
RV64IZFINXZDINX-NEXT: ret ; Use %a and %b in an FP op to ensure floating point registers are used, even diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll index 9d4ce80b0d544..066b6fe9c5348 100644 --- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -24,31 +24,31 @@ define void @_Z3foov() { ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_49) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_49) ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_48) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_48) -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46) -; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_45) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45) -; CHECK-NEXT: vle16.v v14, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v12, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vs2r.v v14, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vs2r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/float-mem.ll b/llvm/test/CodeGen/RISCV/float-mem.ll index 8ef4f9162c2fe..b5d5f8e7c7e69 100644 --- a/llvm/test/CodeGen/RISCV/float-mem.ll +++ b/llvm/test/CodeGen/RISCV/float-mem.ll @@ -75,10 +75,10 @@ define dso_local float @flw_fsw_global(float %a, float %b) nounwind { ; CHECKIZFINX: # %bb.0: ; CHECKIZFINX-NEXT: fadd.s a0, a0, a1 ; CHECKIZFINX-NEXT: lui a1, %hi(G) -; CHECKIZFINX-NEXT: lw a2, %lo(G)(a1) +; CHECKIZFINX-NEXT: lw zero, %lo(G)(a1) ; CHECKIZFINX-NEXT: addi a2, a1, %lo(G) ; CHECKIZFINX-NEXT: sw a0, %lo(G)(a1) -; CHECKIZFINX-NEXT: lw a1, 36(a2) +; CHECKIZFINX-NEXT: lw zero, 36(a2) ; CHECKIZFINX-NEXT: sw a0, 36(a2) ; CHECKIZFINX-NEXT: ret %1 = fadd float %a, %b diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index e3684af4f4de2..b091b0613c0f3 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -3148,19 +3148,19 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IF-NEXT: lw a0, 8(sp) ; RV32IF-NEXT: lw a1, 12(sp) ; RV32IF-NEXT: lw a2, 20(sp) -; RV32IF-NEXT: lw a4, 16(sp) +; RV32IF-NEXT: lw a3, 16(sp) ; RV32IF-NEXT: beqz a2, .LBB47_2 ; RV32IF-NEXT: # %bb.1: # %entry -; RV32IF-NEXT: slti a3, a2, 0 +; RV32IF-NEXT: slti a4, a2, 0 ; RV32IF-NEXT: j .LBB47_3 ; RV32IF-NEXT: .LBB47_2: -; RV32IF-NEXT: seqz a3, a4 +; RV32IF-NEXT: seqz a4, a3 ; RV32IF-NEXT: .LBB47_3: # %entry -; RV32IF-NEXT: xori a4, a4, 1 -; RV32IF-NEXT: or a4, a4, a2 -; RV32IF-NEXT: 
seqz a4, a4 -; RV32IF-NEXT: addi a4, a4, -1 -; RV32IF-NEXT: and a3, a4, a3 +; RV32IF-NEXT: xori a3, a3, 1 +; RV32IF-NEXT: or a3, a3, a2 +; RV32IF-NEXT: seqz a3, a3 +; RV32IF-NEXT: addi a3, a3, -1 +; RV32IF-NEXT: and a3, a3, a4 ; RV32IF-NEXT: neg a3, a3 ; RV32IF-NEXT: and a1, a3, a1 ; RV32IF-NEXT: and a0, a3, a0 @@ -3206,19 +3206,19 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IFD-NEXT: lw a0, 8(sp) ; RV32IFD-NEXT: lw a1, 12(sp) ; RV32IFD-NEXT: lw a2, 20(sp) -; RV32IFD-NEXT: lw a4, 16(sp) +; RV32IFD-NEXT: lw a3, 16(sp) ; RV32IFD-NEXT: beqz a2, .LBB47_2 ; RV32IFD-NEXT: # %bb.1: # %entry -; RV32IFD-NEXT: slti a3, a2, 0 +; RV32IFD-NEXT: slti a4, a2, 0 ; RV32IFD-NEXT: j .LBB47_3 ; RV32IFD-NEXT: .LBB47_2: -; RV32IFD-NEXT: seqz a3, a4 +; RV32IFD-NEXT: seqz a4, a3 ; RV32IFD-NEXT: .LBB47_3: # %entry -; RV32IFD-NEXT: xori a4, a4, 1 -; RV32IFD-NEXT: or a4, a4, a2 -; RV32IFD-NEXT: seqz a4, a4 -; RV32IFD-NEXT: addi a4, a4, -1 -; RV32IFD-NEXT: and a3, a4, a3 +; RV32IFD-NEXT: xori a3, a3, 1 +; RV32IFD-NEXT: or a3, a3, a2 +; RV32IFD-NEXT: seqz a3, a3 +; RV32IFD-NEXT: addi a3, a3, -1 +; RV32IFD-NEXT: and a3, a3, a4 ; RV32IFD-NEXT: neg a3, a3 ; RV32IFD-NEXT: and a1, a3, a1 ; RV32IFD-NEXT: and a0, a3, a0 @@ -3374,19 +3374,19 @@ define i64 @ustest_f32i64_mm(float %x) { ; RV32-NEXT: lw a0, 8(sp) ; RV32-NEXT: lw a1, 12(sp) ; RV32-NEXT: lw a2, 20(sp) -; RV32-NEXT: lw a4, 16(sp) +; RV32-NEXT: lw a3, 16(sp) ; RV32-NEXT: beqz a2, .LBB50_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a3, a2, 0 +; RV32-NEXT: slti a4, a2, 0 ; RV32-NEXT: j .LBB50_3 ; RV32-NEXT: .LBB50_2: -; RV32-NEXT: seqz a3, a4 +; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB50_3: # %entry -; RV32-NEXT: xori a4, a4, 1 -; RV32-NEXT: or a4, a4, a2 -; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: addi a4, a4, -1 -; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: xori a3, a3, 1 +; RV32-NEXT: or a3, a3, a2 +; RV32-NEXT: seqz a3, a3 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 @@ -3599,19 +3599,19 @@ define i64 @ustest_f16i64_mm(half %x) { ; RV32-NEXT: lw a0, 8(sp) ; RV32-NEXT: lw a1, 12(sp) ; RV32-NEXT: lw a2, 20(sp) -; RV32-NEXT: lw a4, 16(sp) +; RV32-NEXT: lw a3, 16(sp) ; RV32-NEXT: beqz a2, .LBB53_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a3, a2, 0 +; RV32-NEXT: slti a4, a2, 0 ; RV32-NEXT: j .LBB53_3 ; RV32-NEXT: .LBB53_2: -; RV32-NEXT: seqz a3, a4 +; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB53_3: # %entry -; RV32-NEXT: xori a4, a4, 1 -; RV32-NEXT: or a4, a4, a2 -; RV32-NEXT: seqz a4, a4 -; RV32-NEXT: addi a4, a4, -1 -; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: xori a3, a3, 1 +; RV32-NEXT: or a3, a3, a2 +; RV32-NEXT: seqz a3, a3 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 diff --git a/llvm/test/CodeGen/RISCV/half-mem.ll b/llvm/test/CodeGen/RISCV/half-mem.ll index 6688250999a75..35b129dfcdb78 100644 --- a/llvm/test/CodeGen/RISCV/half-mem.ll +++ b/llvm/test/CodeGen/RISCV/half-mem.ll @@ -128,10 +128,10 @@ define half @flh_fsh_global(half %a, half %b) nounwind { ; CHECKIZHINX-NEXT: fadd.h a0, a0, a1 ; CHECKIZHINX-NEXT: lui a1, %hi(G) ; CHECKIZHINX-NEXT: addi a1, a1, %lo(G) -; CHECKIZHINX-NEXT: lh a2, 0(a1) +; CHECKIZHINX-NEXT: lh zero, 0(a1) ; CHECKIZHINX-NEXT: sh a0, 0(a1) ; CHECKIZHINX-NEXT: addi a1, a1, 18 -; CHECKIZHINX-NEXT: lh a2, 0(a1) +; CHECKIZHINX-NEXT: lh zero, 0(a1) ; CHECKIZHINX-NEXT: sh a0, 0(a1) ; CHECKIZHINX-NEXT: ret ; @@ -157,10 +157,10 @@ define half 
@flh_fsh_global(half %a, half %b) nounwind { ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: lui a1, %hi(G) ; CHECKIZHINXMIN-NEXT: addi a1, a1, %lo(G) -; CHECKIZHINXMIN-NEXT: lh a2, 0(a1) +; CHECKIZHINXMIN-NEXT: lh zero, 0(a1) ; CHECKIZHINXMIN-NEXT: sh a0, 0(a1) ; CHECKIZHINXMIN-NEXT: addi a1, a1, 18 -; CHECKIZHINXMIN-NEXT: lh a2, 0(a1) +; CHECKIZHINXMIN-NEXT: lh zero, 0(a1) ; CHECKIZHINXMIN-NEXT: sh a0, 0(a1) ; CHECKIZHINXMIN-NEXT: ret %1 = fadd half %a, %b diff --git a/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll b/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll index a74572518cd03..eb0e0287d48d7 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll @@ -182,7 +182,7 @@ define void @constraint_m_with_global_2() nounwind { ; RV64I-MEDIUM-NEXT: sw zero, 4(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret - call void asm "sw zero, $0", "=*m"(ptr nonnull elementtype(i32) getelementptr inbounds ([400000 x i32], ptr @eg, i32 0, i32 1)) + call void asm "sw zero, $0", "=*m"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 1)) ret void } @@ -224,7 +224,7 @@ define void @constraint_m_with_global_3() nounwind { ; RV64I-MEDIUM-NEXT: sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret - call void asm "sw zero, $0", "=*m"(ptr nonnull elementtype(i32) getelementptr inbounds ([400000 x i32], ptr @eg, i32 0, i32 2000)) + call void asm "sw zero, $0", "=*m"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 2000)) ret void } @@ -306,7 +306,7 @@ define void @constraint_m_with_extern_weak_global_2() nounwind { ; RV64I-MEDIUM-NEXT: sw zero, 4(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret - call void asm "sw zero, $0", "=*m"(ptr nonnull elementtype(i32) getelementptr inbounds ([400000 x i32], ptr @ewg, i32 0, i32 1)) + call void asm "sw zero, $0", "=*m"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @ewg, i32 0, i32 1)) ret void } @@ -354,7 +354,167 @@ define void @constraint_m_with_extern_weak_global_3() nounwind { ; RV64I-MEDIUM-NEXT: sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret - call void asm "sw zero, $0", "=*m"(ptr nonnull elementtype(i32) getelementptr inbounds ([400000 x i32], ptr @ewg, i32 0, i32 2000)) + call void asm "sw zero, $0", "=*m"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @ewg, i32 0, i32 2000)) + ret void +} + +define void @constraint_m_with_local_1() nounwind { +; RV32I-LABEL: constraint_m_with_local_1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: .Ltmp0: # Block address taken +; RV32I-NEXT: # %bb.1: # %label +; RV32I-NEXT: lui a0, %hi(.Ltmp0) +; RV32I-NEXT: #APP +; RV32I-NEXT: lw zero, %lo(.Ltmp0)(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_m_with_local_1: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: .Ltmp0: # Block address taken +; RV64I-NEXT: # %bb.1: # %label +; RV64I-NEXT: lui a0, %hi(.Ltmp0) +; RV64I-NEXT: #APP +; RV64I-NEXT: lw zero, %lo(.Ltmp0)(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_m_with_local_1: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Ltmp0: # Block address taken +; RV32I-MEDIUM-NEXT: # %bb.1: # %label +; RV32I-MEDIUM-NEXT: .Lpcrel_hi6: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp0) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi6) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: lw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; 
RV64I-MEDIUM-LABEL: constraint_m_with_local_1: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Ltmp0: # Block address taken +; RV64I-MEDIUM-NEXT: # %bb.1: # %label +; RV64I-MEDIUM-NEXT: .Lpcrel_hi6: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp0) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi6) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: lw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret +entry: + br label %label + +label: + tail call void asm sideeffect "lw zero, $0", "*m"(ptr elementtype(ptr) blockaddress(@constraint_m_with_local_1, %label)) + ret void +} + +define void @constraint_m_with_local_2() nounwind { +; RV32I-LABEL: constraint_m_with_local_2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: .Ltmp1: # Block address taken +; RV32I-NEXT: # %bb.1: # %label +; RV32I-NEXT: lui a0, %hi(.Ltmp1) +; RV32I-NEXT: addi a0, a0, %lo(.Ltmp1) +; RV32I-NEXT: #APP +; RV32I-NEXT: lw zero, 4(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_m_with_local_2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: .Ltmp1: # Block address taken +; RV64I-NEXT: # %bb.1: # %label +; RV64I-NEXT: lui a0, %hi(.Ltmp1) +; RV64I-NEXT: addi a0, a0, %lo(.Ltmp1) +; RV64I-NEXT: #APP +; RV64I-NEXT: lw zero, 4(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_m_with_local_2: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Ltmp1: # Block address taken +; RV32I-MEDIUM-NEXT: # %bb.1: # %label +; RV32I-MEDIUM-NEXT: .Lpcrel_hi7: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp1) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi7) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: lw zero, 4(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_m_with_local_2: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Ltmp1: # Block address taken +; RV64I-MEDIUM-NEXT: # %bb.1: # %label +; RV64I-MEDIUM-NEXT: .Lpcrel_hi7: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp1) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi7) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: lw zero, 4(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret +entry: + br label %label + +label: + call void asm "lw zero, $0", "*m"(ptr elementtype(i32) getelementptr (i8, ptr blockaddress(@constraint_m_with_local_2, %label), i32 4)) + ret void +} + +define void @constraint_m_with_local_3() nounwind { +; RV32I-LABEL: constraint_m_with_local_3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: .Ltmp2: # Block address taken +; RV32I-NEXT: # %bb.1: # %label +; RV32I-NEXT: lui a0, %hi(.Ltmp2) +; RV32I-NEXT: addi a0, a0, %lo(.Ltmp2) +; RV32I-NEXT: #APP +; RV32I-NEXT: lw zero, 2000(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_m_with_local_3: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: .Ltmp2: # Block address taken +; RV64I-NEXT: # %bb.1: # %label +; RV64I-NEXT: lui a0, %hi(.Ltmp2) +; RV64I-NEXT: addi a0, a0, %lo(.Ltmp2) +; RV64I-NEXT: #APP +; RV64I-NEXT: lw zero, 2000(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_m_with_local_3: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Ltmp2: # Block address taken +; RV32I-MEDIUM-NEXT: # %bb.1: # %label +; RV32I-MEDIUM-NEXT: .Lpcrel_hi8: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp2) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi8) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: lw zero, 2000(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: 
constraint_m_with_local_3: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Ltmp2: # Block address taken +; RV64I-MEDIUM-NEXT: # %bb.1: # %label +; RV64I-MEDIUM-NEXT: .Lpcrel_hi8: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp2) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi8) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: lw zero, 2000(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret +entry: + br label %label + +label: + call void asm "lw zero, $0", "*m"(ptr elementtype(i32) getelementptr (i8, ptr blockaddress(@constraint_m_with_local_3, %label), i32 2000)) ret void } @@ -377,9 +537,9 @@ define void @constraint_m_with_multi_operands() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_m_with_multi_operands: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi6: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi9: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi6) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi9) ; RV32I-MEDIUM-NEXT: #APP ; RV32I-MEDIUM-NEXT: sw zero, 0(a0); sw zero, 0(a0) ; RV32I-MEDIUM-NEXT: #NO_APP @@ -387,9 +547,9 @@ define void @constraint_m_with_multi_operands() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_m_with_multi_operands: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi6: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi9: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi6) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi9) ; RV64I-MEDIUM-NEXT: #APP ; RV64I-MEDIUM-NEXT: sw zero, 0(a0); sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP @@ -423,9 +583,9 @@ define void @constraint_m_with_multi_asm() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_m_with_multi_asm: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi7: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi10: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi7) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi10) ; RV32I-MEDIUM-NEXT: #APP ; RV32I-MEDIUM-NEXT: sw zero, 0(a0) ; RV32I-MEDIUM-NEXT: #NO_APP @@ -436,9 +596,9 @@ define void @constraint_m_with_multi_asm() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_m_with_multi_asm: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi7: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi10: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi7) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi10) ; RV64I-MEDIUM-NEXT: #APP ; RV64I-MEDIUM-NEXT: sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP @@ -456,12 +616,12 @@ define i32 @constraint_m_with_callbr_multi_operands(i32 %a) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a1, %hi(eg) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, %lo(eg)(a1); sw zero, %lo(eg)(a1); beqz a0, .LBB11_2 +; RV32I-NEXT: sw zero, %lo(eg)(a1); sw zero, %lo(eg)(a1); beqz a0, .LBB14_2 ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: # %bb.1: # %normal ; RV32I-NEXT: li a0, 0 ; RV32I-NEXT: ret -; RV32I-NEXT: .LBB11_2: # Block address taken +; RV32I-NEXT: .LBB14_2: # Block address taken ; RV32I-NEXT: # %fail ; RV32I-NEXT: # Label of block must be emitted ; RV32I-NEXT: li a0, 1 @@ -471,12 +631,12 @@ define i32 @constraint_m_with_callbr_multi_operands(i32 %a) { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: lui a1, %hi(eg) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, %lo(eg)(a1); sw zero, %lo(eg)(a1); beqz a0, .LBB11_2 +; RV64I-NEXT: sw zero, %lo(eg)(a1); sw zero, %lo(eg)(a1); beqz a0, .LBB14_2 ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: # %bb.1: # %normal ; RV64I-NEXT: li a0, 0 ; 
RV64I-NEXT: ret -; RV64I-NEXT: .LBB11_2: # Block address taken +; RV64I-NEXT: .LBB14_2: # Block address taken ; RV64I-NEXT: # %fail ; RV64I-NEXT: # Label of block must be emitted ; RV64I-NEXT: li a0, 1 @@ -484,16 +644,16 @@ define i32 @constraint_m_with_callbr_multi_operands(i32 %a) { ; ; RV32I-MEDIUM-LABEL: constraint_m_with_callbr_multi_operands: ; RV32I-MEDIUM: # %bb.0: # %entry -; RV32I-MEDIUM-NEXT: .Lpcrel_hi8: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi11: ; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi8) +; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi11) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB11_2 +; RV32I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB14_2 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.1: # %normal ; RV32I-MEDIUM-NEXT: li a0, 0 ; RV32I-MEDIUM-NEXT: ret -; RV32I-MEDIUM-NEXT: .LBB11_2: # Block address taken +; RV32I-MEDIUM-NEXT: .LBB14_2: # Block address taken ; RV32I-MEDIUM-NEXT: # %fail ; RV32I-MEDIUM-NEXT: # Label of block must be emitted ; RV32I-MEDIUM-NEXT: li a0, 1 @@ -501,16 +661,16 @@ define i32 @constraint_m_with_callbr_multi_operands(i32 %a) { ; ; RV64I-MEDIUM-LABEL: constraint_m_with_callbr_multi_operands: ; RV64I-MEDIUM: # %bb.0: # %entry -; RV64I-MEDIUM-NEXT: .Lpcrel_hi8: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi11: ; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi8) +; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi11) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB11_2 +; RV64I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB14_2 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.1: # %normal ; RV64I-MEDIUM-NEXT: li a0, 0 ; RV64I-MEDIUM-NEXT: ret -; RV64I-MEDIUM-NEXT: .LBB11_2: # Block address taken +; RV64I-MEDIUM-NEXT: .LBB14_2: # Block address taken ; RV64I-MEDIUM-NEXT: # %fail ; RV64I-MEDIUM-NEXT: # Label of block must be emitted ; RV64I-MEDIUM-NEXT: li a0, 1 @@ -530,16 +690,16 @@ define i32 @constraint_m_with_multi_callbr_asm(i32 %a) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a1, %hi(eg) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB12_3 +; RV32I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB15_3 ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: # %bb.1: # %normal0 ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB12_3 +; RV32I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB15_3 ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: # %bb.2: # %normal1 ; RV32I-NEXT: li a0, 0 ; RV32I-NEXT: ret -; RV32I-NEXT: .LBB12_3: # Block address taken +; RV32I-NEXT: .LBB15_3: # Block address taken ; RV32I-NEXT: # %fail ; RV32I-NEXT: # Label of block must be emitted ; RV32I-NEXT: li a0, 1 @@ -549,16 +709,16 @@ define i32 @constraint_m_with_multi_callbr_asm(i32 %a) { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: lui a1, %hi(eg) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB12_3 +; RV64I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB15_3 ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: # %bb.1: # %normal0 ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB12_3 +; RV64I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB15_3 ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: # %bb.2: # %normal1 ; RV64I-NEXT: li a0, 0 ; RV64I-NEXT: ret -; RV64I-NEXT: .LBB12_3: # Block address taken +; RV64I-NEXT: .LBB15_3: # Block address taken ; RV64I-NEXT: # %fail ; RV64I-NEXT: # Label of block must be emitted ; RV64I-NEXT: li a0, 1 @@ -566,20 +726,20 @@ 
define i32 @constraint_m_with_multi_callbr_asm(i32 %a) { ; ; RV32I-MEDIUM-LABEL: constraint_m_with_multi_callbr_asm: ; RV32I-MEDIUM: # %bb.0: # %entry -; RV32I-MEDIUM-NEXT: .Lpcrel_hi9: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi12: ; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi9) +; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi12) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB12_3 +; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB15_3 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.1: # %normal0 ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB12_3 +; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB15_3 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.2: # %normal1 ; RV32I-MEDIUM-NEXT: li a0, 0 ; RV32I-MEDIUM-NEXT: ret -; RV32I-MEDIUM-NEXT: .LBB12_3: # Block address taken +; RV32I-MEDIUM-NEXT: .LBB15_3: # Block address taken ; RV32I-MEDIUM-NEXT: # %fail ; RV32I-MEDIUM-NEXT: # Label of block must be emitted ; RV32I-MEDIUM-NEXT: li a0, 1 @@ -587,20 +747,20 @@ define i32 @constraint_m_with_multi_callbr_asm(i32 %a) { ; ; RV64I-MEDIUM-LABEL: constraint_m_with_multi_callbr_asm: ; RV64I-MEDIUM: # %bb.0: # %entry -; RV64I-MEDIUM-NEXT: .Lpcrel_hi9: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi12: ; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi9) +; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi12) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB12_3 +; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB15_3 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.1: # %normal0 ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB12_3 +; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB15_3 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.2: # %normal1 ; RV64I-MEDIUM-NEXT: li a0, 0 ; RV64I-MEDIUM-NEXT: ret -; RV64I-MEDIUM-NEXT: .LBB12_3: # Block address taken +; RV64I-MEDIUM-NEXT: .LBB15_3: # Block address taken ; RV64I-MEDIUM-NEXT: # %fail ; RV64I-MEDIUM-NEXT: # Label of block must be emitted ; RV64I-MEDIUM-NEXT: li a0, 1 @@ -730,9 +890,9 @@ define void @constraint_o_with_global_1() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_global_1: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi10: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi13: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi10) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi13) ; RV32I-MEDIUM-NEXT: #APP ; RV32I-MEDIUM-NEXT: sw zero, 0(a0) ; RV32I-MEDIUM-NEXT: #NO_APP @@ -740,9 +900,9 @@ define void @constraint_o_with_global_1() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_global_1: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi10: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi13: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi10) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi13) ; RV64I-MEDIUM-NEXT: #APP ; RV64I-MEDIUM-NEXT: sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP @@ -772,9 +932,9 @@ define void @constraint_o_with_global_2() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_global_2: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi11: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi14: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi11) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi14) ; RV32I-MEDIUM-NEXT: #APP ; 
RV32I-MEDIUM-NEXT: sw zero, 4(a0) ; RV32I-MEDIUM-NEXT: #NO_APP @@ -782,14 +942,14 @@ define void @constraint_o_with_global_2() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_global_2: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi11: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi14: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi11) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi14) ; RV64I-MEDIUM-NEXT: #APP ; RV64I-MEDIUM-NEXT: sw zero, 4(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret - call void asm "sw zero, $0", "=*o"(ptr nonnull elementtype(i32) getelementptr inbounds ([400000 x i32], ptr @eg, i32 0, i32 1)) + call void asm "sw zero, $0", "=*o"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 1)) ret void } @@ -814,9 +974,9 @@ define void @constraint_o_with_global_3() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_global_3: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi12: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi15: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+8000) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi12) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi15) ; RV32I-MEDIUM-NEXT: #APP ; RV32I-MEDIUM-NEXT: sw zero, 0(a0) ; RV32I-MEDIUM-NEXT: #NO_APP @@ -824,14 +984,14 @@ define void @constraint_o_with_global_3() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_global_3: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi12: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi15: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+8000) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi12) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi15) ; RV64I-MEDIUM-NEXT: #APP ; RV64I-MEDIUM-NEXT: sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret - call void asm "sw zero, $0", "=*o"(ptr nonnull elementtype(i32) getelementptr inbounds ([400000 x i32], ptr @eg, i32 0, i32 2000)) + call void asm "sw zero, $0", "=*o"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 2000)) ret void } @@ -854,9 +1014,9 @@ define void @constraint_o_with_extern_weak_global_1() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_extern_weak_global_1: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi13: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi16: ; RV32I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) -; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi13)(a0) +; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi16)(a0) ; RV32I-MEDIUM-NEXT: #APP ; RV32I-MEDIUM-NEXT: sw zero, 0(a0) ; RV32I-MEDIUM-NEXT: #NO_APP @@ -864,9 +1024,9 @@ define void @constraint_o_with_extern_weak_global_1() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_extern_weak_global_1: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi13: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi16: ; RV64I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) -; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi13)(a0) +; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi16)(a0) ; RV64I-MEDIUM-NEXT: #APP ; RV64I-MEDIUM-NEXT: sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP @@ -896,9 +1056,9 @@ define void @constraint_o_with_extern_weak_global_2() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_extern_weak_global_2: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi14: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi17: ; RV32I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) -; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi14)(a0) +; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi17)(a0) ; RV32I-MEDIUM-NEXT: #APP ; RV32I-MEDIUM-NEXT: sw zero, 4(a0) ; 
RV32I-MEDIUM-NEXT: #NO_APP @@ -906,14 +1066,14 @@ define void @constraint_o_with_extern_weak_global_2() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_extern_weak_global_2: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi14: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi17: ; RV64I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) -; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi14)(a0) +; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi17)(a0) ; RV64I-MEDIUM-NEXT: #APP ; RV64I-MEDIUM-NEXT: sw zero, 4(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret - call void asm "sw zero, $0", "=*o"(ptr nonnull elementtype(i32) getelementptr inbounds ([400000 x i32], ptr @ewg, i32 0, i32 1)) + call void asm "sw zero, $0", "=*o"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @ewg, i32 0, i32 1)) ret void } @@ -938,9 +1098,9 @@ define void @constraint_o_with_extern_weak_global_3() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_extern_weak_global_3: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi15: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi18: ; RV32I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) -; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi15)(a0) +; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi18)(a0) ; RV32I-MEDIUM-NEXT: lui a1, 2 ; RV32I-MEDIUM-NEXT: addi a1, a1, -192 ; RV32I-MEDIUM-NEXT: add a0, a0, a1 @@ -951,9 +1111,9 @@ define void @constraint_o_with_extern_weak_global_3() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_extern_weak_global_3: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi15: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi18: ; RV64I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) -; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi15)(a0) +; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi18)(a0) ; RV64I-MEDIUM-NEXT: lui a1, 2 ; RV64I-MEDIUM-NEXT: addiw a1, a1, -192 ; RV64I-MEDIUM-NEXT: add a0, a0, a1 @@ -961,7 +1121,7 @@ define void @constraint_o_with_extern_weak_global_3() nounwind { ; RV64I-MEDIUM-NEXT: sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret - call void asm "sw zero, $0", "=*o"(ptr nonnull elementtype(i32) getelementptr inbounds ([400000 x i32], ptr @ewg, i32 0, i32 2000)) + call void asm "sw zero, $0", "=*o"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @ewg, i32 0, i32 2000)) ret void } @@ -984,9 +1144,9 @@ define void @constraint_o_with_multi_operands() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_multi_operands: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi16: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi19: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi16) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi19) ; RV32I-MEDIUM-NEXT: #APP ; RV32I-MEDIUM-NEXT: sw zero, 0(a0) \n sw zero, 0(a0) ; RV32I-MEDIUM-NEXT: #NO_APP @@ -994,9 +1154,9 @@ define void @constraint_o_with_multi_operands() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_multi_operands: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi16: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi19: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi16) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi19) ; RV64I-MEDIUM-NEXT: #APP ; RV64I-MEDIUM-NEXT: sw zero, 0(a0) \n sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP @@ -1030,9 +1190,9 @@ define void @constraint_o_with_multi_asm() nounwind { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_multi_asm: ; RV32I-MEDIUM: # %bb.0: -; RV32I-MEDIUM-NEXT: .Lpcrel_hi17: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi20: ; RV32I-MEDIUM-NEXT: auipc 
a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi17) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi20) ; RV32I-MEDIUM-NEXT: #APP ; RV32I-MEDIUM-NEXT: sw zero, 0(a0) ; RV32I-MEDIUM-NEXT: #NO_APP @@ -1043,9 +1203,9 @@ define void @constraint_o_with_multi_asm() nounwind { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_multi_asm: ; RV64I-MEDIUM: # %bb.0: -; RV64I-MEDIUM-NEXT: .Lpcrel_hi17: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi20: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi17) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi20) ; RV64I-MEDIUM-NEXT: #APP ; RV64I-MEDIUM-NEXT: sw zero, 0(a0) ; RV64I-MEDIUM-NEXT: #NO_APP @@ -1063,12 +1223,12 @@ define i32 @constraint_o_with_callbr_multi_operands(i32 %a) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a1, %hi(eg) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, %lo(eg)(a1); sw zero, %lo(eg)(a1); beqz a0, .LBB24_2 +; RV32I-NEXT: sw zero, %lo(eg)(a1); sw zero, %lo(eg)(a1); beqz a0, .LBB27_2 ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: # %bb.1: # %normal ; RV32I-NEXT: li a0, 0 ; RV32I-NEXT: ret -; RV32I-NEXT: .LBB24_2: # Block address taken +; RV32I-NEXT: .LBB27_2: # Block address taken ; RV32I-NEXT: # %fail ; RV32I-NEXT: # Label of block must be emitted ; RV32I-NEXT: li a0, 1 @@ -1078,12 +1238,12 @@ define i32 @constraint_o_with_callbr_multi_operands(i32 %a) { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: lui a1, %hi(eg) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, %lo(eg)(a1); sw zero, %lo(eg)(a1); beqz a0, .LBB24_2 +; RV64I-NEXT: sw zero, %lo(eg)(a1); sw zero, %lo(eg)(a1); beqz a0, .LBB27_2 ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: # %bb.1: # %normal ; RV64I-NEXT: li a0, 0 ; RV64I-NEXT: ret -; RV64I-NEXT: .LBB24_2: # Block address taken +; RV64I-NEXT: .LBB27_2: # Block address taken ; RV64I-NEXT: # %fail ; RV64I-NEXT: # Label of block must be emitted ; RV64I-NEXT: li a0, 1 @@ -1091,16 +1251,16 @@ define i32 @constraint_o_with_callbr_multi_operands(i32 %a) { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_callbr_multi_operands: ; RV32I-MEDIUM: # %bb.0: # %entry -; RV32I-MEDIUM-NEXT: .Lpcrel_hi18: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi21: ; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi18) +; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi21) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB24_2 +; RV32I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB27_2 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.1: # %normal ; RV32I-MEDIUM-NEXT: li a0, 0 ; RV32I-MEDIUM-NEXT: ret -; RV32I-MEDIUM-NEXT: .LBB24_2: # Block address taken +; RV32I-MEDIUM-NEXT: .LBB27_2: # Block address taken ; RV32I-MEDIUM-NEXT: # %fail ; RV32I-MEDIUM-NEXT: # Label of block must be emitted ; RV32I-MEDIUM-NEXT: li a0, 1 @@ -1108,16 +1268,16 @@ define i32 @constraint_o_with_callbr_multi_operands(i32 %a) { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_callbr_multi_operands: ; RV64I-MEDIUM: # %bb.0: # %entry -; RV64I-MEDIUM-NEXT: .Lpcrel_hi18: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi21: ; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi18) +; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi21) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB24_2 +; RV64I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB27_2 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.1: # %normal ; RV64I-MEDIUM-NEXT: li a0, 0 ; RV64I-MEDIUM-NEXT: ret -; 
RV64I-MEDIUM-NEXT: .LBB24_2: # Block address taken +; RV64I-MEDIUM-NEXT: .LBB27_2: # Block address taken ; RV64I-MEDIUM-NEXT: # %fail ; RV64I-MEDIUM-NEXT: # Label of block must be emitted ; RV64I-MEDIUM-NEXT: li a0, 1 @@ -1137,16 +1297,16 @@ define i32 @constraint_o_with_multi_callbr_asm(i32 %a) { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lui a1, %hi(eg) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB25_3 +; RV32I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB28_3 ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: # %bb.1: # %normal0 ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB25_3 +; RV32I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB28_3 ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: # %bb.2: # %normal1 ; RV32I-NEXT: li a0, 0 ; RV32I-NEXT: ret -; RV32I-NEXT: .LBB25_3: # Block address taken +; RV32I-NEXT: .LBB28_3: # Block address taken ; RV32I-NEXT: # %fail ; RV32I-NEXT: # Label of block must be emitted ; RV32I-NEXT: li a0, 1 @@ -1156,16 +1316,16 @@ define i32 @constraint_o_with_multi_callbr_asm(i32 %a) { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: lui a1, %hi(eg) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB25_3 +; RV64I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB28_3 ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: # %bb.1: # %normal0 ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB25_3 +; RV64I-NEXT: sw zero, %lo(eg)(a1); beqz a0, .LBB28_3 ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: # %bb.2: # %normal1 ; RV64I-NEXT: li a0, 0 ; RV64I-NEXT: ret -; RV64I-NEXT: .LBB25_3: # Block address taken +; RV64I-NEXT: .LBB28_3: # Block address taken ; RV64I-NEXT: # %fail ; RV64I-NEXT: # Label of block must be emitted ; RV64I-NEXT: li a0, 1 @@ -1173,20 +1333,20 @@ define i32 @constraint_o_with_multi_callbr_asm(i32 %a) { ; ; RV32I-MEDIUM-LABEL: constraint_o_with_multi_callbr_asm: ; RV32I-MEDIUM: # %bb.0: # %entry -; RV32I-MEDIUM-NEXT: .Lpcrel_hi19: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi22: ; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi19) +; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi22) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB25_3 +; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB28_3 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.1: # %normal0 ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB25_3 +; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB28_3 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.2: # %normal1 ; RV32I-MEDIUM-NEXT: li a0, 0 ; RV32I-MEDIUM-NEXT: ret -; RV32I-MEDIUM-NEXT: .LBB25_3: # Block address taken +; RV32I-MEDIUM-NEXT: .LBB28_3: # Block address taken ; RV32I-MEDIUM-NEXT: # %fail ; RV32I-MEDIUM-NEXT: # Label of block must be emitted ; RV32I-MEDIUM-NEXT: li a0, 1 @@ -1194,20 +1354,20 @@ define i32 @constraint_o_with_multi_callbr_asm(i32 %a) { ; ; RV64I-MEDIUM-LABEL: constraint_o_with_multi_callbr_asm: ; RV64I-MEDIUM: # %bb.0: # %entry -; RV64I-MEDIUM-NEXT: .Lpcrel_hi19: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi22: ; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi19) +; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi22) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB25_3 +; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB28_3 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.1: # %normal0 ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB25_3 +; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB28_3 ; 
RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.2: # %normal1 ; RV64I-MEDIUM-NEXT: li a0, 0 ; RV64I-MEDIUM-NEXT: ret -; RV64I-MEDIUM-NEXT: .LBB25_3: # Block address taken +; RV64I-MEDIUM-NEXT: .LBB28_3: # Block address taken ; RV64I-MEDIUM-NEXT: # %fail ; RV64I-MEDIUM-NEXT: # Label of block must be emitted ; RV64I-MEDIUM-NEXT: li a0, 1 @@ -1225,6 +1385,166 @@ fail: ret i32 1 } +define void @constraint_o_with_local_1() nounwind { +; RV32I-LABEL: constraint_o_with_local_1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: .Ltmp3: # Block address taken +; RV32I-NEXT: # %bb.1: # %label +; RV32I-NEXT: lui a0, %hi(.Ltmp3) +; RV32I-NEXT: #APP +; RV32I-NEXT: lw zero, %lo(.Ltmp3)(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_o_with_local_1: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: .Ltmp3: # Block address taken +; RV64I-NEXT: # %bb.1: # %label +; RV64I-NEXT: lui a0, %hi(.Ltmp3) +; RV64I-NEXT: #APP +; RV64I-NEXT: lw zero, %lo(.Ltmp3)(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_o_with_local_1: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Ltmp3: # Block address taken +; RV32I-MEDIUM-NEXT: # %bb.1: # %label +; RV32I-MEDIUM-NEXT: .Lpcrel_hi23: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp3) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi23) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: lw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_o_with_local_1: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Ltmp3: # Block address taken +; RV64I-MEDIUM-NEXT: # %bb.1: # %label +; RV64I-MEDIUM-NEXT: .Lpcrel_hi23: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp3) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi23) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: lw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret +entry: + br label %label + +label: + tail call void asm sideeffect "lw zero, $0", "*o"(ptr elementtype(ptr) blockaddress(@constraint_o_with_local_1, %label)) + ret void +} + +define void @constraint_o_with_local_2() nounwind { +; RV32I-LABEL: constraint_o_with_local_2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: .Ltmp4: # Block address taken +; RV32I-NEXT: # %bb.1: # %label +; RV32I-NEXT: lui a0, %hi(.Ltmp4) +; RV32I-NEXT: addi a0, a0, %lo(.Ltmp4) +; RV32I-NEXT: #APP +; RV32I-NEXT: lw zero, 4(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_o_with_local_2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: .Ltmp4: # Block address taken +; RV64I-NEXT: # %bb.1: # %label +; RV64I-NEXT: lui a0, %hi(.Ltmp4) +; RV64I-NEXT: addi a0, a0, %lo(.Ltmp4) +; RV64I-NEXT: #APP +; RV64I-NEXT: lw zero, 4(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_o_with_local_2: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Ltmp4: # Block address taken +; RV32I-MEDIUM-NEXT: # %bb.1: # %label +; RV32I-MEDIUM-NEXT: .Lpcrel_hi24: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp4) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi24) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: lw zero, 4(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_o_with_local_2: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Ltmp4: # Block address taken +; RV64I-MEDIUM-NEXT: # %bb.1: # %label +; RV64I-MEDIUM-NEXT: .Lpcrel_hi24: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp4) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi24) +; 
RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: lw zero, 4(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret +entry: + br label %label + +label: + call void asm "lw zero, $0", "*o"(ptr elementtype(i32) getelementptr (i8, ptr blockaddress(@constraint_o_with_local_2, %label), i32 4)) + ret void +} + +define void @constraint_o_with_local_3() nounwind { +; RV32I-LABEL: constraint_o_with_local_3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: .Ltmp5: # Block address taken +; RV32I-NEXT: # %bb.1: # %label +; RV32I-NEXT: lui a0, %hi(.Ltmp5) +; RV32I-NEXT: addi a0, a0, %lo(.Ltmp5) +; RV32I-NEXT: #APP +; RV32I-NEXT: lw zero, 2000(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_o_with_local_3: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: .Ltmp5: # Block address taken +; RV64I-NEXT: # %bb.1: # %label +; RV64I-NEXT: lui a0, %hi(.Ltmp5) +; RV64I-NEXT: addi a0, a0, %lo(.Ltmp5) +; RV64I-NEXT: #APP +; RV64I-NEXT: lw zero, 2000(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_o_with_local_3: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Ltmp5: # Block address taken +; RV32I-MEDIUM-NEXT: # %bb.1: # %label +; RV32I-MEDIUM-NEXT: .Lpcrel_hi25: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp5) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi25) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: lw zero, 2000(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_o_with_local_3: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Ltmp5: # Block address taken +; RV64I-MEDIUM-NEXT: # %bb.1: # %label +; RV64I-MEDIUM-NEXT: .Lpcrel_hi25: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp5) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi25) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: lw zero, 2000(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret +entry: + br label %label + +label: + call void asm "lw zero, $0", "*o"(ptr elementtype(i32) getelementptr (i8, ptr blockaddress(@constraint_o_with_local_3, %label), i32 2000)) + ret void +} + define void @constraint_A(ptr %a) nounwind { ; RV32I-LABEL: constraint_A: ; RV32I: # %bb.0: @@ -1306,3 +1626,693 @@ define i32 @constraint_A_with_offset(ptr %a) nounwind { %2 = tail call i32 asm "lw $0, $1", "=r,*A"(ptr elementtype(i32) %1) ret i32 %2 } + +define void @constraint_A_with_global_1() nounwind { +; RV32I-LABEL: constraint_A_with_global_1: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, %hi(eg) +; RV32I-NEXT: addi a0, a0, %lo(eg) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_global_1: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a0, %hi(eg) +; RV64I-NEXT: addi a0, a0, %lo(eg) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_global_1: +; RV32I-MEDIUM: # %bb.0: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi26: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi26) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_global_1: +; RV64I-MEDIUM: # %bb.0: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi26: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi26) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret + call void asm "sw zero, 
$0", "=*A"(ptr elementtype(i32) @eg) + ret void +} + +define void @constraint_A_with_global_2() nounwind { +; RV32I-LABEL: constraint_A_with_global_2: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, %hi(eg+4) +; RV32I-NEXT: addi a0, a0, %lo(eg+4) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_global_2: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a0, %hi(eg+4) +; RV64I-NEXT: addi a0, a0, %lo(eg+4) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_global_2: +; RV32I-MEDIUM: # %bb.0: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi27: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+4) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi27) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_global_2: +; RV64I-MEDIUM: # %bb.0: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi27: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+4) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi27) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret + call void asm "sw zero, $0", "=*A"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 1)) + ret void +} + +define void @constraint_A_with_global_3() nounwind { +; RV32I-LABEL: constraint_A_with_global_3: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, %hi(eg+8000) +; RV32I-NEXT: addi a0, a0, %lo(eg+8000) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_global_3: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a0, %hi(eg+8000) +; RV64I-NEXT: addi a0, a0, %lo(eg+8000) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_global_3: +; RV32I-MEDIUM: # %bb.0: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi28: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+8000) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi28) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_global_3: +; RV64I-MEDIUM: # %bb.0: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi28: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+8000) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi28) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret + call void asm "sw zero, $0", "=*A"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 2000)) + ret void +} + +define void @constraint_A_with_extern_weak_global_1() nounwind { +; RV32I-LABEL: constraint_A_with_extern_weak_global_1: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, %hi(ewg) +; RV32I-NEXT: addi a0, a0, %lo(ewg) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_extern_weak_global_1: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a0, %hi(ewg) +; RV64I-NEXT: addi a0, a0, %lo(ewg) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_extern_weak_global_1: +; RV32I-MEDIUM: # %bb.0: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi29: +; RV32I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) +; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi29)(a0) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw 
zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_extern_weak_global_1: +; RV64I-MEDIUM: # %bb.0: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi29: +; RV64I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) +; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi29)(a0) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret + call void asm "sw zero, $0", "=*A"(ptr elementtype(i32) @ewg) + ret void +} + +define void @constraint_A_with_extern_weak_global_2() nounwind { +; RV32I-LABEL: constraint_A_with_extern_weak_global_2: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, %hi(ewg+4) +; RV32I-NEXT: addi a0, a0, %lo(ewg+4) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_extern_weak_global_2: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a0, %hi(ewg+4) +; RV64I-NEXT: addi a0, a0, %lo(ewg+4) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_extern_weak_global_2: +; RV32I-MEDIUM: # %bb.0: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi30: +; RV32I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) +; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi30)(a0) +; RV32I-MEDIUM-NEXT: addi a0, a0, 4 +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_extern_weak_global_2: +; RV64I-MEDIUM: # %bb.0: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi30: +; RV64I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) +; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi30)(a0) +; RV64I-MEDIUM-NEXT: addi a0, a0, 4 +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret + call void asm "sw zero, $0", "=*A"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @ewg, i32 0, i32 1)) + ret void +} + +define void @constraint_A_with_extern_weak_global_3() nounwind { +; RV32I-LABEL: constraint_A_with_extern_weak_global_3: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, %hi(ewg+8000) +; RV32I-NEXT: addi a0, a0, %lo(ewg+8000) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_extern_weak_global_3: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a0, %hi(ewg+8000) +; RV64I-NEXT: addi a0, a0, %lo(ewg+8000) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_extern_weak_global_3: +; RV32I-MEDIUM: # %bb.0: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi31: +; RV32I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) +; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi31)(a0) +; RV32I-MEDIUM-NEXT: lui a1, 2 +; RV32I-MEDIUM-NEXT: addi a1, a1, -192 +; RV32I-MEDIUM-NEXT: add a0, a0, a1 +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_extern_weak_global_3: +; RV64I-MEDIUM: # %bb.0: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi31: +; RV64I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(ewg) +; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi31)(a0) +; RV64I-MEDIUM-NEXT: lui a1, 2 +; RV64I-MEDIUM-NEXT: addiw a1, a1, -192 +; RV64I-MEDIUM-NEXT: add a0, a0, a1 +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret + call void asm "sw zero, $0", "=*A"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr 
@ewg, i32 0, i32 2000)) + ret void +} + +define void @constraint_A_with_multi_operands() nounwind { +; RV32I-LABEL: constraint_A_with_multi_operands: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, %hi(eg) +; RV32I-NEXT: addi a0, a0, %lo(eg) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a0) \n sw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_multi_operands: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a0, %hi(eg) +; RV64I-NEXT: addi a0, a0, %lo(eg) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a0) \n sw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_multi_operands: +; RV32I-MEDIUM: # %bb.0: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi32: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi32) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a0) \n sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_multi_operands: +; RV64I-MEDIUM: # %bb.0: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi32: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi32) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a0) \n sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret + call void asm "sw zero, $0 \n sw zero, $1", "=*A,=*A"(ptr elementtype(i32) @eg, ptr elementtype(i32) @eg) + ret void +} + +define void @constraint_A_with_multi_asm() nounwind { +; RV32I-LABEL: constraint_A_with_multi_asm: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, %hi(eg) +; RV32I-NEXT: addi a0, a0, %lo(eg) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_multi_asm: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a0, %hi(eg) +; RV64I-NEXT: addi a0, a0, %lo(eg) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_multi_asm: +; RV32I-MEDIUM: # %bb.0: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi33: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi33) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_multi_asm: +; RV64I-MEDIUM: # %bb.0: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi33: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi33) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret + call void asm "sw zero, $0", "=*A"(ptr elementtype(i32) @eg) + call void asm "sw zero, $0", "=*A"(ptr elementtype(i32) @eg) + ret void +} + +define i32 @constraint_A_with_callbr_multi_operands(i32 %a) { +; RV32I-LABEL: constraint_A_with_callbr_multi_operands: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, %hi(eg) +; RV32I-NEXT: addi a1, a1, %lo(eg) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB42_2 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: # %bb.1: # %normal +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB42_2: # Block address 
taken +; RV32I-NEXT: # %fail +; RV32I-NEXT: # Label of block must be emitted +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_callbr_multi_operands: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: lui a1, %hi(eg) +; RV64I-NEXT: addi a1, a1, %lo(eg) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB42_2 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: # %bb.1: # %normal +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB42_2: # Block address taken +; RV64I-NEXT: # %fail +; RV64I-NEXT: # Label of block must be emitted +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_callbr_multi_operands: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Lpcrel_hi34: +; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) +; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi34) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB42_2 +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: # %bb.1: # %normal +; RV32I-MEDIUM-NEXT: li a0, 0 +; RV32I-MEDIUM-NEXT: ret +; RV32I-MEDIUM-NEXT: .LBB42_2: # Block address taken +; RV32I-MEDIUM-NEXT: # %fail +; RV32I-MEDIUM-NEXT: # Label of block must be emitted +; RV32I-MEDIUM-NEXT: li a0, 1 +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_callbr_multi_operands: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Lpcrel_hi34: +; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) +; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi34) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB42_2 +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: # %bb.1: # %normal +; RV64I-MEDIUM-NEXT: li a0, 0 +; RV64I-MEDIUM-NEXT: ret +; RV64I-MEDIUM-NEXT: .LBB42_2: # Block address taken +; RV64I-MEDIUM-NEXT: # %fail +; RV64I-MEDIUM-NEXT: # Label of block must be emitted +; RV64I-MEDIUM-NEXT: li a0, 1 +; RV64I-MEDIUM-NEXT: ret +entry: + callbr void asm "sw zero, $0; sw zero, $1; beqz $2, $3", "*A,*A,r,!i"(ptr elementtype(i32) @eg, ptr elementtype(i32) @eg, i32 %a) to label %normal [label %fail] + +normal: + ret i32 0 + +fail: + ret i32 1 +} + +define i32 @constraint_A_with_multi_callbr_asm(i32 %a) { +; RV32I-LABEL: constraint_A_with_multi_callbr_asm: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, %hi(eg) +; RV32I-NEXT: addi a1, a1, %lo(eg) +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a1); beqz a0, .LBB43_3 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: # %bb.1: # %normal0 +; RV32I-NEXT: #APP +; RV32I-NEXT: sw zero, 0(a1); beqz a0, .LBB43_3 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: # %bb.2: # %normal1 +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB43_3: # Block address taken +; RV32I-NEXT: # %fail +; RV32I-NEXT: # Label of block must be emitted +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_multi_callbr_asm: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: lui a1, %hi(eg) +; RV64I-NEXT: addi a1, a1, %lo(eg) +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a1); beqz a0, .LBB43_3 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: # %bb.1: # %normal0 +; RV64I-NEXT: #APP +; RV64I-NEXT: sw zero, 0(a1); beqz a0, .LBB43_3 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: # %bb.2: # %normal1 +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB43_3: # Block address taken +; RV64I-NEXT: # %fail +; RV64I-NEXT: # Label of block must be emitted +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_multi_callbr_asm: +; RV32I-MEDIUM: # %bb.0: # %entry +; 
RV32I-MEDIUM-NEXT: .Lpcrel_hi35: +; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) +; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi35) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB43_3 +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: # %bb.1: # %normal0 +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB43_3 +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: # %bb.2: # %normal1 +; RV32I-MEDIUM-NEXT: li a0, 0 +; RV32I-MEDIUM-NEXT: ret +; RV32I-MEDIUM-NEXT: .LBB43_3: # Block address taken +; RV32I-MEDIUM-NEXT: # %fail +; RV32I-MEDIUM-NEXT: # Label of block must be emitted +; RV32I-MEDIUM-NEXT: li a0, 1 +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_multi_callbr_asm: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Lpcrel_hi35: +; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) +; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi35) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB43_3 +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: # %bb.1: # %normal0 +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB43_3 +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: # %bb.2: # %normal1 +; RV64I-MEDIUM-NEXT: li a0, 0 +; RV64I-MEDIUM-NEXT: ret +; RV64I-MEDIUM-NEXT: .LBB43_3: # Block address taken +; RV64I-MEDIUM-NEXT: # %fail +; RV64I-MEDIUM-NEXT: # Label of block must be emitted +; RV64I-MEDIUM-NEXT: li a0, 1 +; RV64I-MEDIUM-NEXT: ret +entry: + callbr void asm "sw zero, $0; beqz $1, $2", "=*A,r,!i"(ptr elementtype(i32) @eg, i32 %a) to label %normal0 [label %fail] + +normal0: + callbr void asm "sw zero, $0; beqz $1, $2", "=*A,r,!i"(ptr elementtype(i32) @eg, i32 %a) to label %normal1 [label %fail] + +normal1: + ret i32 0 + +fail: + ret i32 1 +} + +define void @constraint_A_with_local_1() nounwind { +; RV32I-LABEL: constraint_A_with_local_1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: .Ltmp6: # Block address taken +; RV32I-NEXT: # %bb.1: # %label +; RV32I-NEXT: lui a0, %hi(.Ltmp6) +; RV32I-NEXT: addi a0, a0, %lo(.Ltmp6) +; RV32I-NEXT: #APP +; RV32I-NEXT: lw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_local_1: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: .Ltmp6: # Block address taken +; RV64I-NEXT: # %bb.1: # %label +; RV64I-NEXT: lui a0, %hi(.Ltmp6) +; RV64I-NEXT: addi a0, a0, %lo(.Ltmp6) +; RV64I-NEXT: #APP +; RV64I-NEXT: lw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_local_1: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Ltmp6: # Block address taken +; RV32I-MEDIUM-NEXT: # %bb.1: # %label +; RV32I-MEDIUM-NEXT: .Lpcrel_hi36: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp6) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi36) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: lw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_local_1: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Ltmp6: # Block address taken +; RV64I-MEDIUM-NEXT: # %bb.1: # %label +; RV64I-MEDIUM-NEXT: .Lpcrel_hi36: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp6) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi36) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: lw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret +entry: + br label %label + +label: + tail call void asm sideeffect "lw zero, $0", "*A"(ptr elementtype(ptr) blockaddress(@constraint_A_with_local_1, %label)) 
+ ret void +} + +define void @constraint_A_with_local_2() nounwind { +; RV32I-LABEL: constraint_A_with_local_2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: .Ltmp7: # Block address taken +; RV32I-NEXT: # %bb.1: # %label +; RV32I-NEXT: lui a0, %hi(.Ltmp7+4) +; RV32I-NEXT: addi a0, a0, %lo(.Ltmp7+4) +; RV32I-NEXT: #APP +; RV32I-NEXT: lw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_local_2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: .Ltmp7: # Block address taken +; RV64I-NEXT: # %bb.1: # %label +; RV64I-NEXT: lui a0, %hi(.Ltmp7+4) +; RV64I-NEXT: addi a0, a0, %lo(.Ltmp7+4) +; RV64I-NEXT: #APP +; RV64I-NEXT: lw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_local_2: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Ltmp7: # Block address taken +; RV32I-MEDIUM-NEXT: # %bb.1: # %label +; RV32I-MEDIUM-NEXT: .Lpcrel_hi37: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp7+4) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi37) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: lw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_local_2: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Ltmp7: # Block address taken +; RV64I-MEDIUM-NEXT: # %bb.1: # %label +; RV64I-MEDIUM-NEXT: .Lpcrel_hi37: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp7+4) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi37) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: lw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret +entry: + br label %label + +label: + call void asm "lw zero, $0", "*A"(ptr elementtype(i32) getelementptr (i8, ptr blockaddress(@constraint_A_with_local_2, %label), i32 4)) + ret void +} + +define void @constraint_A_with_local_3() nounwind { +; RV32I-LABEL: constraint_A_with_local_3: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: .Ltmp8: # Block address taken +; RV32I-NEXT: # %bb.1: # %label +; RV32I-NEXT: lui a0, %hi(.Ltmp8+2000) +; RV32I-NEXT: addi a0, a0, %lo(.Ltmp8+2000) +; RV32I-NEXT: #APP +; RV32I-NEXT: lw zero, 0(a0) +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_A_with_local_3: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: .Ltmp8: # Block address taken +; RV64I-NEXT: # %bb.1: # %label +; RV64I-NEXT: lui a0, %hi(.Ltmp8+2000) +; RV64I-NEXT: addi a0, a0, %lo(.Ltmp8+2000) +; RV64I-NEXT: #APP +; RV64I-NEXT: lw zero, 0(a0) +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: constraint_A_with_local_3: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: .Ltmp8: # Block address taken +; RV32I-MEDIUM-NEXT: # %bb.1: # %label +; RV32I-MEDIUM-NEXT: .Lpcrel_hi38: +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp8+2000) +; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi38) +; RV32I-MEDIUM-NEXT: #APP +; RV32I-MEDIUM-NEXT: lw zero, 0(a0) +; RV32I-MEDIUM-NEXT: #NO_APP +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-MEDIUM-LABEL: constraint_A_with_local_3: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: .Ltmp8: # Block address taken +; RV64I-MEDIUM-NEXT: # %bb.1: # %label +; RV64I-MEDIUM-NEXT: .Lpcrel_hi38: +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp8+2000) +; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi38) +; RV64I-MEDIUM-NEXT: #APP +; RV64I-MEDIUM-NEXT: lw zero, 0(a0) +; RV64I-MEDIUM-NEXT: #NO_APP +; RV64I-MEDIUM-NEXT: ret +entry: + br label %label + +label: + call void asm "lw zero, $0", "*A"(ptr elementtype(i32) getelementptr (i8, ptr 
blockaddress(@constraint_A_with_local_3, %label), i32 2000)) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/local-stack-slot-allocation.ll b/llvm/test/CodeGen/RISCV/local-stack-slot-allocation.ll index ef64eeb9b1869..40d08513e3cf3 100644 --- a/llvm/test/CodeGen/RISCV/local-stack-slot-allocation.ll +++ b/llvm/test/CodeGen/RISCV/local-stack-slot-allocation.ll @@ -15,8 +15,8 @@ define void @use_frame_base_reg() { ; RV32I-NEXT: lui a0, 24 ; RV32I-NEXT: addi a0, a0, 1704 ; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: lbu a1, 4(a0) -; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: lbu zero, 4(a0) +; RV32I-NEXT: lbu zero, 0(a0) ; RV32I-NEXT: lui a0, 24 ; RV32I-NEXT: addi a0, a0, 1712 ; RV32I-NEXT: add sp, sp, a0 @@ -31,8 +31,8 @@ define void @use_frame_base_reg() { ; RV64I-NEXT: lui a0, 24 ; RV64I-NEXT: addiw a0, a0, 1704 ; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: lbu a1, 4(a0) -; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: lbu zero, 4(a0) +; RV64I-NEXT: lbu zero, 0(a0) ; RV64I-NEXT: lui a0, 24 ; RV64I-NEXT: addiw a0, a0, 1712 ; RV64I-NEXT: add sp, sp, a0 @@ -57,10 +57,10 @@ define void @load_with_offset() { ; RV32I-NEXT: .cfi_def_cfa_offset 100608 ; RV32I-NEXT: lui a0, 25 ; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: lbu a0, -292(a0) +; RV32I-NEXT: lbu zero, -292(a0) ; RV32I-NEXT: lui a0, 24 ; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: lbu a0, 1704(a0) +; RV32I-NEXT: lbu zero, 1704(a0) ; RV32I-NEXT: lui a0, 25 ; RV32I-NEXT: addi a0, a0, -1792 ; RV32I-NEXT: add sp, sp, a0 @@ -74,10 +74,10 @@ define void @load_with_offset() { ; RV64I-NEXT: .cfi_def_cfa_offset 100608 ; RV64I-NEXT: lui a0, 25 ; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: lbu a0, -292(a0) +; RV64I-NEXT: lbu zero, -292(a0) ; RV64I-NEXT: lui a0, 24 ; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: lbu a0, 1704(a0) +; RV64I-NEXT: lbu zero, 1704(a0) ; RV64I-NEXT: lui a0, 25 ; RV64I-NEXT: addiw a0, a0, -1792 ; RV64I-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll index f6d69791e5d7c..17167e7f51b4c 100644 --- a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll +++ b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll @@ -16,7 +16,7 @@ define void @test_lla(i32 signext %n) { ; RV32I-NEXT: auipc a2, %pcrel_hi(l) ; RV32I-NEXT: .LBB0_1: # %loop ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: lw a3, %pcrel_lo(.Lpcrel_hi0)(a2) +; RV32I-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi0)(a2) ; RV32I-NEXT: addi a1, a1, 1 ; RV32I-NEXT: blt a1, a0, .LBB0_1 ; RV32I-NEXT: # %bb.2: # %ret @@ -29,7 +29,7 @@ define void @test_lla(i32 signext %n) { ; RV64I-NEXT: auipc a2, %pcrel_hi(l) ; RV64I-NEXT: .LBB0_1: # %loop ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: lw a3, %pcrel_lo(.Lpcrel_hi0)(a2) +; RV64I-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi0)(a2) ; RV64I-NEXT: addiw a1, a1, 1 ; RV64I-NEXT: blt a1, a0, .LBB0_1 ; RV64I-NEXT: # %bb.2: # %ret @@ -59,7 +59,7 @@ define void @test_la(i32 signext %n) { ; RV32I-NEXT: li a2, 0 ; RV32I-NEXT: .LBB1_1: # %loop ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw zero, 0(a1) ; RV32I-NEXT: addi a2, a2, 1 ; RV32I-NEXT: blt a2, a0, .LBB1_1 ; RV32I-NEXT: # %bb.2: # %ret @@ -73,7 +73,7 @@ define void @test_la(i32 signext %n) { ; RV64I-NEXT: li a2, 0 ; RV64I-NEXT: .LBB1_1: # %loop ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: lw a3, 0(a1) +; RV64I-NEXT: lw zero, 0(a1) ; RV64I-NEXT: addiw a2, a2, 1 ; RV64I-NEXT: blt a2, a0, .LBB1_1 ; RV64I-NEXT: # %bb.2: # 
%ret @@ -104,7 +104,7 @@ define void @test_la_tls_ie(i32 signext %n) { ; RV32I-NEXT: add a2, a2, tp ; RV32I-NEXT: .LBB2_1: # %loop ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: lw a3, 0(a2) +; RV32I-NEXT: lw zero, 0(a2) ; RV32I-NEXT: addi a1, a1, 1 ; RV32I-NEXT: blt a1, a0, .LBB2_1 ; RV32I-NEXT: # %bb.2: # %ret @@ -119,7 +119,7 @@ define void @test_la_tls_ie(i32 signext %n) { ; RV64I-NEXT: add a2, a2, tp ; RV64I-NEXT: .LBB2_1: # %loop ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: lw a3, 0(a2) +; RV64I-NEXT: lw zero, 0(a2) ; RV64I-NEXT: addiw a1, a1, 1 ; RV64I-NEXT: blt a1, a0, .LBB2_1 ; RV64I-NEXT: # %bb.2: # %ret @@ -157,7 +157,7 @@ define void @test_la_tls_gd(i32 signext %n) nounwind { ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __tls_get_addr@plt -; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw zero, 0(a0) ; RV32I-NEXT: addi s2, s2, 1 ; RV32I-NEXT: blt s2, s0, .LBB3_1 ; RV32I-NEXT: # %bb.2: # %ret @@ -184,7 +184,7 @@ define void @test_la_tls_gd(i32 signext %n) nounwind { ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __tls_get_addr@plt -; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: lw zero, 0(a0) ; RV64I-NEXT: addiw s2, s2, 1 ; RV64I-NEXT: blt s2, s0, .LBB3_1 ; RV64I-NEXT: # %bb.2: # %ret diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll index 74874c1ca74b3..3718ce80142d4 100644 --- a/llvm/test/CodeGen/RISCV/mem.ll +++ b/llvm/test/CodeGen/RISCV/mem.ll @@ -8,7 +8,7 @@ define dso_local i32 @lb(ptr %a) nounwind { ; RV32I-LABEL: lb: ; RV32I: # %bb.0: ; RV32I-NEXT: lb a1, 1(a0) -; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: lbu zero, 0(a0) ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: ret %1 = getelementptr i8, ptr %a, i32 1 @@ -23,7 +23,7 @@ define dso_local i32 @lh(ptr %a) nounwind { ; RV32I-LABEL: lh: ; RV32I: # %bb.0: ; RV32I-NEXT: lh a1, 4(a0) -; RV32I-NEXT: lh a0, 0(a0) +; RV32I-NEXT: lh zero, 0(a0) ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: ret %1 = getelementptr i16, ptr %a, i32 2 @@ -38,7 +38,7 @@ define dso_local i32 @lw(ptr %a) nounwind { ; RV32I-LABEL: lw: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a1, 12(a0) -; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw zero, 0(a0) ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: ret %1 = getelementptr i32, ptr %a, i32 3 @@ -123,7 +123,7 @@ define dso_local i32 @load_sext_zext_anyext_i1(ptr %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a1, 1(a0) ; RV32I-NEXT: lbu a2, 2(a0) -; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: lbu zero, 0(a0) ; RV32I-NEXT: sub a0, a2, a1 ; RV32I-NEXT: ret ; sextload i1 @@ -145,7 +145,7 @@ define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lbu a1, 1(a0) ; RV32I-NEXT: lbu a2, 2(a0) -; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: lbu zero, 0(a0) ; RV32I-NEXT: sub a0, a2, a1 ; RV32I-NEXT: ret ; sextload i1 @@ -172,7 +172,7 @@ define dso_local i32 @lw_sw_global(i32 %a) nounwind { ; RV32I-NEXT: lw a1, %lo(G)(a2) ; RV32I-NEXT: addi a3, a2, %lo(G) ; RV32I-NEXT: sw a0, %lo(G)(a2) -; RV32I-NEXT: lw a2, 36(a3) +; RV32I-NEXT: lw zero, 36(a3) ; RV32I-NEXT: sw a0, 36(a3) ; RV32I-NEXT: mv a0, a1 ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll index 903c5b223b69c..09b04535498c5 100644 --- a/llvm/test/CodeGen/RISCV/mem64.ll +++ b/llvm/test/CodeGen/RISCV/mem64.ll @@ -8,7 +8,7 @@ define dso_local i64 @lb(ptr %a) nounwind { ; RV64I-LABEL: lb: ; RV64I: # %bb.0: ; RV64I-NEXT: lb a1, 1(a0) -; RV64I-NEXT: lbu a0, 0(a0) +; 
RV64I-NEXT: lbu zero, 0(a0) ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i8, ptr %a, i32 1 @@ -23,7 +23,7 @@ define dso_local i64 @lh(ptr %a) nounwind { ; RV64I-LABEL: lh: ; RV64I: # %bb.0: ; RV64I-NEXT: lh a1, 4(a0) -; RV64I-NEXT: lh a0, 0(a0) +; RV64I-NEXT: lh zero, 0(a0) ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i16, ptr %a, i32 2 @@ -38,7 +38,7 @@ define dso_local i64 @lw(ptr %a) nounwind { ; RV64I-LABEL: lw: ; RV64I: # %bb.0: ; RV64I-NEXT: lw a1, 12(a0) -; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: lw zero, 0(a0) ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i32, ptr %a, i32 3 @@ -141,7 +141,7 @@ define dso_local i64 @ld(ptr %a) nounwind { ; RV64I-LABEL: ld: ; RV64I: # %bb.0: ; RV64I-NEXT: ld a1, 80(a0) -; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: ld zero, 0(a0) ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ret %1 = getelementptr i64, ptr %a, i32 10 @@ -168,7 +168,7 @@ define dso_local i64 @load_sext_zext_anyext_i1(ptr %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: lbu a1, 1(a0) ; RV64I-NEXT: lbu a2, 2(a0) -; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: lbu zero, 0(a0) ; RV64I-NEXT: sub a0, a2, a1 ; RV64I-NEXT: ret ; sextload i1 @@ -190,7 +190,7 @@ define dso_local i16 @load_sext_zext_anyext_i1_i16(ptr %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: lbu a1, 1(a0) ; RV64I-NEXT: lbu a2, 2(a0) -; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: lbu zero, 0(a0) ; RV64I-NEXT: sub a0, a2, a1 ; RV64I-NEXT: ret ; sextload i1 @@ -217,7 +217,7 @@ define dso_local i64 @ld_sd_global(i64 %a) nounwind { ; RV64I-NEXT: ld a1, %lo(G)(a2) ; RV64I-NEXT: addi a3, a2, %lo(G) ; RV64I-NEXT: sd a0, %lo(G)(a2) -; RV64I-NEXT: ld a2, 72(a3) +; RV64I-NEXT: ld zero, 72(a3) ; RV64I-NEXT: sd a0, 72(a3) ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index 341db9a1a172a..f2b7e8d26328d 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1252,39 +1252,39 @@ define i128 @muli128_m63(i128 %a) nounwind { ; RV32I-LABEL: muli128_m63: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a2, 0(a1) -; RV32I-NEXT: lw a5, 12(a1) -; RV32I-NEXT: lw a7, 8(a1) -; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: slli a1, a2, 6 -; RV32I-NEXT: sltu a4, a2, a1 -; RV32I-NEXT: srli a6, a2, 26 -; RV32I-NEXT: slli t0, a3, 6 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: mv t0, a4 -; RV32I-NEXT: beq a3, a6, .LBB31_2 +; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: lw a6, 8(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: slli a3, a2, 6 +; RV32I-NEXT: sltu a5, a2, a3 +; RV32I-NEXT: srli a7, a2, 26 +; RV32I-NEXT: slli t0, a1, 6 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: mv t0, a5 +; RV32I-NEXT: beq a1, a7, .LBB31_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sltu t0, a3, a6 +; RV32I-NEXT: sltu t0, a1, a7 ; RV32I-NEXT: .LBB31_2: -; RV32I-NEXT: srli t1, a3, 26 -; RV32I-NEXT: slli t2, a7, 6 +; RV32I-NEXT: srli t1, a1, 26 +; RV32I-NEXT: slli t2, a6, 6 ; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: sub t2, a7, t1 +; RV32I-NEXT: sub t2, a6, t1 ; RV32I-NEXT: sltu t3, t2, t0 -; RV32I-NEXT: sltu t1, a7, t1 -; RV32I-NEXT: srli a7, a7, 26 -; RV32I-NEXT: slli t4, a5, 6 -; RV32I-NEXT: or a7, t4, a7 -; RV32I-NEXT: sub a5, a5, a7 -; RV32I-NEXT: sub a5, a5, t1 -; RV32I-NEXT: sub a5, a5, t3 -; RV32I-NEXT: sub a7, t2, t0 -; RV32I-NEXT: sub a3, a3, a6 -; RV32I-NEXT: sub a3, a3, a4 -; RV32I-NEXT: sub a2, a2, a1 +; RV32I-NEXT: sltu t1, a6, t1 +; RV32I-NEXT: srli a6, a6, 26 +; RV32I-NEXT: slli t4, a4, 6 +; RV32I-NEXT: or a6, t4, a6 +; RV32I-NEXT: sub a4, a4, a6 +; RV32I-NEXT: sub a4, 
a4, t1 +; RV32I-NEXT: sub a4, a4, t3 +; RV32I-NEXT: sub a6, t2, t0 +; RV32I-NEXT: sub a1, a1, a7 +; RV32I-NEXT: sub a1, a1, a5 +; RV32I-NEXT: sub a2, a2, a3 ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a7, 8(a0) -; RV32I-NEXT: sw a5, 12(a0) +; RV32I-NEXT: sw a1, 4(a0) +; RV32I-NEXT: sw a6, 8(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli128_m63: diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index ffbecc5074d3a..42f998e68bb6e 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -1074,7 +1074,7 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .cfi_offset s4, -24 ; RV32-NEXT: .cfi_offset s5, -28 ; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: mv s4, a5 +; RV32-NEXT: mv s2, a5 ; RV32-NEXT: andi a5, a5, 1 ; RV32-NEXT: beqz a5, .LBB32_8 ; RV32-NEXT: # %bb.1: # %t @@ -1082,19 +1082,19 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: mv s3, a3 ; RV32-NEXT: mv s1, a2 ; RV32-NEXT: mv s5, a1 -; RV32-NEXT: mv s2, a0 +; RV32-NEXT: mv s4, a0 ; RV32-NEXT: beq a1, a3, .LBB32_3 ; RV32-NEXT: # %bb.2: # %t ; RV32-NEXT: sltu s6, s5, s3 ; RV32-NEXT: j .LBB32_4 ; RV32-NEXT: .LBB32_3: -; RV32-NEXT: sltu s6, s2, s1 +; RV32-NEXT: sltu s6, s4, s1 ; RV32-NEXT: .LBB32_4: # %t ; RV32-NEXT: mv a0, s6 ; RV32-NEXT: call call@plt ; RV32-NEXT: beqz s6, .LBB32_8 ; RV32-NEXT: # %bb.5: # %end -; RV32-NEXT: sltu a1, s2, s1 +; RV32-NEXT: sltu a1, s4, s1 ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: beq s5, s3, .LBB32_7 ; RV32-NEXT: # %bb.6: # %end @@ -1102,12 +1102,12 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .LBB32_7: # %end ; RV32-NEXT: sub a2, s5, s3 ; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: sub a1, s2, s1 +; RV32-NEXT: sub a1, s4, s1 ; RV32-NEXT: sw a1, 0(s0) ; RV32-NEXT: sw a2, 4(s0) ; RV32-NEXT: j .LBB32_9 ; RV32-NEXT: .LBB32_8: # %f -; RV32-NEXT: mv a0, s4 +; RV32-NEXT: mv a0, s2 ; RV32-NEXT: .LBB32_9: # %f ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/push-pop-popret.ll b/llvm/test/CodeGen/RISCV/push-pop-popret.ll index af3828ed7d839..84e4062ca333d 100644 --- a/llvm/test/CodeGen/RISCV/push-pop-popret.ll +++ b/llvm/test/CodeGen/RISCV/push-pop-popret.ll @@ -1841,16 +1841,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: sw t4, 44(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: sw t5, 40(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: sw t6, 36(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lui a7, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-NEXT: lui a6, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw 
a0, 16(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -1875,7 +1875,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: lw t1, 92(a5) ; RV32IZCMP-NEXT: lw t0, 96(a5) ; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a6, 104(a5) +; RV32IZCMP-NEXT: lw a7, 104(a5) ; RV32IZCMP-NEXT: lw a4, 108(a5) ; RV32IZCMP-NEXT: lw a0, 124(a5) ; RV32IZCMP-NEXT: lw a1, 120(a5) @@ -1886,7 +1886,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: sw a2, 116(a5) ; RV32IZCMP-NEXT: sw a3, 112(a5) ; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a6, 104(a5) +; RV32IZCMP-NEXT: sw a7, 104(a5) ; RV32IZCMP-NEXT: sw s0, 100(a5) ; RV32IZCMP-NEXT: sw t0, 96(a5) ; RV32IZCMP-NEXT: sw t1, 92(a5) @@ -1912,13 +1912,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-NEXT: lw a0, 32(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-NEXT: lw t0, 92(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t1, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: lw t2, 84(sp) # 4-byte Folded Reload @@ -1957,16 +1957,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: sd t4, 72(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: sd t5, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: sd t6, 56(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lui a7, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-NEXT: lui a6, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -1991,7 +1991,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: lw t1, 92(a5) ; RV64IZCMP-NEXT: lw t0, 96(a5) ; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a6, 104(a5) +; RV64IZCMP-NEXT: lw a7, 104(a5) ; RV64IZCMP-NEXT: lw a4, 108(a5) ; RV64IZCMP-NEXT: lw a0, 124(a5) ; RV64IZCMP-NEXT: lw a1, 120(a5) @@ -2002,7 +2002,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: sw a2, 116(a5) ; RV64IZCMP-NEXT: sw a3, 112(a5) ; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a6, 104(a5) +; RV64IZCMP-NEXT: sw a7, 104(a5) ; RV64IZCMP-NEXT: sw s0, 100(a5) ; RV64IZCMP-NEXT: sw t0, 96(a5) ; 
RV64IZCMP-NEXT: sw t1, 92(a5) @@ -2028,13 +2028,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-NEXT: ld t0, 168(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t1, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: ld t2, 152(sp) # 8-byte Folded Reload @@ -2073,16 +2073,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: sw t4, 44(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: sw t5, 40(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: sw t6, 36(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 32(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32IZCMP-SR-NEXT: lw a0, 16(a5) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2107,7 +2107,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: lw t1, 92(a5) ; RV32IZCMP-SR-NEXT: lw t0, 96(a5) ; RV32IZCMP-SR-NEXT: lw s0, 100(a5) -; RV32IZCMP-SR-NEXT: lw a6, 104(a5) +; RV32IZCMP-SR-NEXT: lw a7, 104(a5) ; RV32IZCMP-SR-NEXT: lw a4, 108(a5) ; RV32IZCMP-SR-NEXT: lw a0, 124(a5) ; RV32IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2118,7 +2118,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: sw a2, 116(a5) ; RV32IZCMP-SR-NEXT: sw a3, 112(a5) ; RV32IZCMP-SR-NEXT: sw a4, 108(a5) -; RV32IZCMP-SR-NEXT: sw a6, 104(a5) +; RV32IZCMP-SR-NEXT: sw a7, 104(a5) ; RV32IZCMP-SR-NEXT: sw s0, 100(a5) ; RV32IZCMP-SR-NEXT: sw t0, 96(a5) ; RV32IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2144,13 +2144,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: sw a0, 16(a5) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: 
sw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 32(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-SR-NEXT: lw t0, 92(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t1, 88(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: lw t2, 84(sp) # 4-byte Folded Reload @@ -2189,16 +2189,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: sd t4, 72(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: sd t5, 64(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: sd t6, 56(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64IZCMP-SR-NEXT: lw a0, 16(a5) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2223,7 +2223,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: lw t1, 92(a5) ; RV64IZCMP-SR-NEXT: lw t0, 96(a5) ; RV64IZCMP-SR-NEXT: lw s0, 100(a5) -; RV64IZCMP-SR-NEXT: lw a6, 104(a5) +; RV64IZCMP-SR-NEXT: lw a7, 104(a5) ; RV64IZCMP-SR-NEXT: lw a4, 108(a5) ; RV64IZCMP-SR-NEXT: lw a0, 124(a5) ; RV64IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2234,7 +2234,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: sw a2, 116(a5) ; RV64IZCMP-SR-NEXT: sw a3, 112(a5) ; RV64IZCMP-SR-NEXT: sw a4, 108(a5) -; RV64IZCMP-SR-NEXT: sw a6, 104(a5) +; RV64IZCMP-SR-NEXT: sw a7, 104(a5) ; RV64IZCMP-SR-NEXT: sw s0, 100(a5) ; RV64IZCMP-SR-NEXT: sw t0, 96(a5) ; RV64IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2260,13 +2260,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: sw a0, 16(a5) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-SR-NEXT: ld t0, 168(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t1, 160(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: ld t2, 152(sp) # 8-byte Folded Reload @@ -2317,16 +2317,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: sw t4, 40(sp) # 4-byte Folded Spill ; RV32I-NEXT: 
sw t5, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw t6, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a7, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: lui a6, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32I-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32I-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -2351,7 +2351,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: lw s10, 92(a5) ; RV32I-NEXT: lw s11, 96(a5) ; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a6, 104(a5) +; RV32I-NEXT: lw a7, 104(a5) ; RV32I-NEXT: lw a4, 108(a5) ; RV32I-NEXT: lw a0, 124(a5) ; RV32I-NEXT: lw a1, 120(a5) @@ -2362,7 +2362,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: sw a2, 116(a5) ; RV32I-NEXT: sw a3, 112(a5) ; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a6, 104(a5) +; RV32I-NEXT: sw a7, 104(a5) ; RV32I-NEXT: sw ra, 100(a5) ; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) @@ -2388,13 +2388,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32I-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t0, 136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw t1, 132(sp) # 4-byte Folded Reload @@ -2457,16 +2457,16 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: sd t4, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd t5, 56(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd t6, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a7, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: lui a6, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64I-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -2491,7 +2491,7 
@@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: lw s10, 92(a5) ; RV64I-NEXT: lw s11, 96(a5) ; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a6, 104(a5) +; RV64I-NEXT: lw a7, 104(a5) ; RV64I-NEXT: lw a4, 108(a5) ; RV64I-NEXT: lw a0, 124(a5) ; RV64I-NEXT: lw a1, 120(a5) @@ -2502,7 +2502,7 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: sw a2, 116(a5) ; RV64I-NEXT: sw a3, 112(a5) ; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a6, 104(a5) +; RV64I-NEXT: sw a7, 104(a5) ; RV64I-NEXT: sw ra, 100(a5) ; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) @@ -2528,13 +2528,13 @@ define void @callee_with_irq() nounwind "interrupt"="user" { ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64I-NEXT: ld ra, 264(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t0, 256(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld t1, 248(sp) # 8-byte Folded Reload @@ -2574,16 +2574,16 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-LABEL: callee_no_irq: ; RV32IZCMP: # %bb.0: ; RV32IZCMP-NEXT: cm.push {ra, s0-s11}, -96 -; RV32IZCMP-NEXT: lui a7, %hi(var_test_irq) -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-NEXT: lui a6, %hi(var_test_irq) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32IZCMP-NEXT: lw a0, 16(a5) ; RV32IZCMP-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-NEXT: lw a0, 20(a5) @@ -2608,7 +2608,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-NEXT: lw t1, 92(a5) ; RV32IZCMP-NEXT: lw t0, 96(a5) ; RV32IZCMP-NEXT: lw s0, 100(a5) -; RV32IZCMP-NEXT: lw a6, 104(a5) +; RV32IZCMP-NEXT: lw a7, 104(a5) ; RV32IZCMP-NEXT: lw a4, 108(a5) ; RV32IZCMP-NEXT: lw a0, 124(a5) ; RV32IZCMP-NEXT: lw a1, 120(a5) @@ -2619,7 +2619,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-NEXT: sw a2, 116(a5) ; RV32IZCMP-NEXT: sw a3, 112(a5) ; RV32IZCMP-NEXT: sw a4, 108(a5) -; RV32IZCMP-NEXT: sw a6, 104(a5) +; RV32IZCMP-NEXT: sw a7, 104(a5) ; RV32IZCMP-NEXT: sw s0, 100(a5) ; RV32IZCMP-NEXT: sw t0, 96(a5) ; RV32IZCMP-NEXT: sw t1, 92(a5) @@ -2645,28 +2645,28 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-NEXT: sw a0, 16(a5) ; RV32IZCMP-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; 
RV32IZCMP-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-NEXT: cm.popret {ra, s0-s11}, 96 ; ; RV64IZCMP-LABEL: callee_no_irq: ; RV64IZCMP: # %bb.0: ; RV64IZCMP-NEXT: cm.push {ra, s0-s11}, -160 -; RV64IZCMP-NEXT: lui a7, %hi(var_test_irq) -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-NEXT: lui a6, %hi(var_test_irq) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64IZCMP-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64IZCMP-NEXT: lw a0, 16(a5) ; RV64IZCMP-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-NEXT: lw a0, 20(a5) @@ -2691,7 +2691,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-NEXT: lw t1, 92(a5) ; RV64IZCMP-NEXT: lw t0, 96(a5) ; RV64IZCMP-NEXT: lw s0, 100(a5) -; RV64IZCMP-NEXT: lw a6, 104(a5) +; RV64IZCMP-NEXT: lw a7, 104(a5) ; RV64IZCMP-NEXT: lw a4, 108(a5) ; RV64IZCMP-NEXT: lw a0, 124(a5) ; RV64IZCMP-NEXT: lw a1, 120(a5) @@ -2702,7 +2702,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-NEXT: sw a2, 116(a5) ; RV64IZCMP-NEXT: sw a3, 112(a5) ; RV64IZCMP-NEXT: sw a4, 108(a5) -; RV64IZCMP-NEXT: sw a6, 104(a5) +; RV64IZCMP-NEXT: sw a7, 104(a5) ; RV64IZCMP-NEXT: sw s0, 100(a5) ; RV64IZCMP-NEXT: sw t0, 96(a5) ; RV64IZCMP-NEXT: sw t1, 92(a5) @@ -2728,29 +2728,29 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-NEXT: sw a0, 16(a5) ; RV64IZCMP-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-NEXT: cm.popret {ra, s0-s11}, 160 ; ; RV32IZCMP-SR-LABEL: callee_no_irq: ; RV32IZCMP-SR: # %bb.0: ; RV32IZCMP-SR-NEXT: call t0, __riscv_save_12 ; RV32IZCMP-SR-NEXT: addi sp, sp, -32 -; RV32IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 24(sp) # 4-byte Folded 
Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-SR-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32IZCMP-SR-NEXT: lw a0, 16(a5) ; RV32IZCMP-SR-NEXT: sw a0, 12(sp) # 4-byte Folded Spill ; RV32IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2775,7 +2775,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-SR-NEXT: lw t1, 92(a5) ; RV32IZCMP-SR-NEXT: lw t0, 96(a5) ; RV32IZCMP-SR-NEXT: lw s0, 100(a5) -; RV32IZCMP-SR-NEXT: lw a6, 104(a5) +; RV32IZCMP-SR-NEXT: lw a7, 104(a5) ; RV32IZCMP-SR-NEXT: lw a4, 108(a5) ; RV32IZCMP-SR-NEXT: lw a0, 124(a5) ; RV32IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2786,7 +2786,7 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-SR-NEXT: sw a2, 116(a5) ; RV32IZCMP-SR-NEXT: sw a3, 112(a5) ; RV32IZCMP-SR-NEXT: sw a4, 108(a5) -; RV32IZCMP-SR-NEXT: sw a6, 104(a5) +; RV32IZCMP-SR-NEXT: sw a7, 104(a5) ; RV32IZCMP-SR-NEXT: sw s0, 100(a5) ; RV32IZCMP-SR-NEXT: sw t0, 96(a5) ; RV32IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2812,13 +2812,13 @@ define void @callee_no_irq() nounwind{ ; RV32IZCMP-SR-NEXT: lw a0, 12(sp) # 4-byte Folded Reload ; RV32IZCMP-SR-NEXT: sw a0, 16(a5) ; RV32IZCMP-SR-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV32IZCMP-SR-NEXT: lw a0, 28(sp) # 4-byte Folded Reload -; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32IZCMP-SR-NEXT: addi sp, sp, 32 ; RV32IZCMP-SR-NEXT: tail __riscv_restore_12 ; @@ -2826,16 +2826,16 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR: # %bb.0: ; RV64IZCMP-SR-NEXT: call t0, __riscv_save_12 ; RV64IZCMP-SR-NEXT: addi sp, sp, -48 -; RV64IZCMP-SR-NEXT: lui a7, %hi(var_test_irq) -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-SR-NEXT: lui a6, %hi(var_test_irq) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-SR-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-SR-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64IZCMP-SR-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64IZCMP-SR-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64IZCMP-SR-NEXT: lw a0, 16(a5) ; RV64IZCMP-SR-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64IZCMP-SR-NEXT: lw a0, 20(a5) @@ -2860,7 +2860,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: lw t1, 92(a5) ; RV64IZCMP-SR-NEXT: lw t0, 96(a5) ; RV64IZCMP-SR-NEXT: lw s0, 100(a5) -; RV64IZCMP-SR-NEXT: lw a6, 104(a5) +; RV64IZCMP-SR-NEXT: lw a7, 104(a5) ; 
RV64IZCMP-SR-NEXT: lw a4, 108(a5) ; RV64IZCMP-SR-NEXT: lw a0, 124(a5) ; RV64IZCMP-SR-NEXT: lw a1, 120(a5) @@ -2871,7 +2871,7 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: sw a2, 116(a5) ; RV64IZCMP-SR-NEXT: sw a3, 112(a5) ; RV64IZCMP-SR-NEXT: sw a4, 108(a5) -; RV64IZCMP-SR-NEXT: sw a6, 104(a5) +; RV64IZCMP-SR-NEXT: sw a7, 104(a5) ; RV64IZCMP-SR-NEXT: sw s0, 100(a5) ; RV64IZCMP-SR-NEXT: sw t0, 96(a5) ; RV64IZCMP-SR-NEXT: sw t1, 92(a5) @@ -2897,13 +2897,13 @@ define void @callee_no_irq() nounwind{ ; RV64IZCMP-SR-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64IZCMP-SR-NEXT: sw a0, 16(a5) ; RV64IZCMP-SR-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64IZCMP-SR-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64IZCMP-SR-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64IZCMP-SR-NEXT: addi sp, sp, 48 ; RV64IZCMP-SR-NEXT: tail __riscv_restore_12 ; @@ -2923,16 +2923,16 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: sw s9, 36(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 32(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: lui a7, %hi(var_test_irq) -; RV32I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: lui a6, %hi(var_test_irq) +; RV32I-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV32I-NEXT: addi a5, a6, %lo(var_test_irq) ; RV32I-NEXT: lw a0, 16(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: lw a0, 20(a5) @@ -2957,7 +2957,7 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: lw s10, 92(a5) ; RV32I-NEXT: lw s11, 96(a5) ; RV32I-NEXT: lw ra, 100(a5) -; RV32I-NEXT: lw a6, 104(a5) +; RV32I-NEXT: lw a7, 104(a5) ; RV32I-NEXT: lw a4, 108(a5) ; RV32I-NEXT: lw a0, 124(a5) ; RV32I-NEXT: lw a1, 120(a5) @@ -2968,7 +2968,7 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: sw a2, 116(a5) ; RV32I-NEXT: sw a3, 112(a5) ; RV32I-NEXT: sw a4, 108(a5) -; RV32I-NEXT: sw a6, 104(a5) +; RV32I-NEXT: sw a7, 104(a5) ; RV32I-NEXT: sw ra, 100(a5) ; RV32I-NEXT: sw s11, 96(a5) ; RV32I-NEXT: sw s10, 92(a5) @@ -2994,13 +2994,13 @@ define void @callee_no_irq() nounwind{ ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: sw a0, 16(a5) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV32I-NEXT: sw a0, 
%lo(var_test_irq+4)(a6) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV32I-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 68(sp) # 4-byte Folded Reload @@ -3033,16 +3033,16 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: sd s9, 72(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: lui a7, %hi(var_test_irq) -; RV64I-NEXT: lw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: lui a6, %hi(var_test_irq) +; RV64I-NEXT: lw a0, %lo(var_test_irq)(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+4)(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+8)(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: lw a0, %lo(var_test_irq+12)(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: addi a5, a7, %lo(var_test_irq) +; RV64I-NEXT: addi a5, a6, %lo(var_test_irq) ; RV64I-NEXT: lw a0, 16(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 20(a5) @@ -3067,7 +3067,7 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: lw s10, 92(a5) ; RV64I-NEXT: lw s11, 96(a5) ; RV64I-NEXT: lw ra, 100(a5) -; RV64I-NEXT: lw a6, 104(a5) +; RV64I-NEXT: lw a7, 104(a5) ; RV64I-NEXT: lw a4, 108(a5) ; RV64I-NEXT: lw a0, 124(a5) ; RV64I-NEXT: lw a1, 120(a5) @@ -3078,7 +3078,7 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: sw a2, 116(a5) ; RV64I-NEXT: sw a3, 112(a5) ; RV64I-NEXT: sw a4, 108(a5) -; RV64I-NEXT: sw a6, 104(a5) +; RV64I-NEXT: sw a7, 104(a5) ; RV64I-NEXT: sw ra, 100(a5) ; RV64I-NEXT: sw s11, 96(a5) ; RV64I-NEXT: sw s10, 92(a5) @@ -3104,13 +3104,13 @@ define void @callee_no_irq() nounwind{ ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: sw a0, 16(a5) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+12)(a6) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+8)(a6) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq+4)(a6) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sw a0, %lo(var_test_irq)(a7) +; RV64I-NEXT: sw a0, %lo(var_test_irq)(a6) ; RV64I-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 144(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 136(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll index 0479cb223907f..71040bf2646d2 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll @@ -138,25 +138,25 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: rol_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: slli a5, a2, 26 -; CHECK-NEXT: srli a5, a5, 31 +; CHECK-NEXT: slli a3, a2, 26 +; CHECK-NEXT: srli a3, a3, 31 ; CHECK-NEXT: mv a4, a1 -; CHECK-NEXT: bnez a5, .LBB7_2 +; CHECK-NEXT: bnez a3, .LBB7_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: .LBB7_2: -; CHECK-NEXT: sll a3, a4, a2 -; CHECK-NEXT: bnez a5, 
.LBB7_4 +; CHECK-NEXT: sll a5, a4, a2 +; CHECK-NEXT: bnez a3, .LBB7_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: not a5, a2 -; CHECK-NEXT: srl a1, a1, a5 -; CHECK-NEXT: or a3, a3, a1 +; CHECK-NEXT: not a6, a2 +; CHECK-NEXT: srl a3, a1, a6 +; CHECK-NEXT: or a3, a5, a3 ; CHECK-NEXT: sll a0, a0, a2 ; CHECK-NEXT: srli a4, a4, 1 -; CHECK-NEXT: srl a1, a4, a5 +; CHECK-NEXT: srl a1, a4, a6 ; CHECK-NEXT: or a1, a0, a1 ; CHECK-NEXT: mv a0, a3 ; CHECK-NEXT: ret @@ -191,24 +191,24 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: ror_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: andi a5, a2, 32 +; CHECK-NEXT: andi a4, a2, 32 ; CHECK-NEXT: mv a3, a0 -; CHECK-NEXT: beqz a5, .LBB9_2 +; CHECK-NEXT: beqz a4, .LBB9_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a3, a1 ; CHECK-NEXT: .LBB9_2: -; CHECK-NEXT: srl a4, a3, a2 -; CHECK-NEXT: beqz a5, .LBB9_4 +; CHECK-NEXT: srl a5, a3, a2 +; CHECK-NEXT: beqz a4, .LBB9_4 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: .LBB9_4: ; CHECK-NEXT: slli a0, a1, 1 -; CHECK-NEXT: not a5, a2 -; CHECK-NEXT: sll a0, a0, a5 -; CHECK-NEXT: or a0, a0, a4 +; CHECK-NEXT: not a4, a2 +; CHECK-NEXT: sll a0, a0, a4 +; CHECK-NEXT: or a0, a0, a5 ; CHECK-NEXT: srl a1, a1, a2 ; CHECK-NEXT: slli a3, a3, 1 -; CHECK-NEXT: sll a2, a3, a5 +; CHECK-NEXT: sll a2, a3, a4 ; CHECK-NEXT: or a1, a2, a1 ; CHECK-NEXT: ret %or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll index 2cdc07ef140ed..aa3d7b3fa8a7c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll @@ -1512,26 +1512,26 @@ define @bitreverse_nxv8i64( %va) { ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v0, v8, 24 ; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v24, (a3), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vand.vx v0, v0, a3 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vx v0, v8, a2 ; RV32-NEXT: vsll.vx v0, v0, a1 -; RV32-NEXT: vsll.vx v16, v8, a0 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsll.vx v24, v8, a0 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a3 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll index 25bee211fb2b5..690ecc6eab33b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll @@ -607,26 +607,26 @@ define @bswap_nxv8i64( %va) { ; RV32-NEXT: vs8r.v v16, (a3) 
# Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v0, v8, 24 ; RV32-NEXT: addi a3, sp, 8 -; RV32-NEXT: vlse64.v v24, (a3), zero +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: lui a3, 4080 ; RV32-NEXT: vand.vx v0, v0, a3 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vor.vv v16, v16, v0 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vor.vv v24, v24, v0 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vx v0, v8, a2 ; RV32-NEXT: vsll.vx v0, v0, a1 -; RV32-NEXT: vsll.vx v16, v8, a0 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vsll.vx v24, v8, a0 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 ; RV32-NEXT: vand.vx v8, v8, a3 ; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v24, v8 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll index f68ac2212b527..7ce167f892973 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll @@ -737,14 +737,16 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v2, v0, a2 +; CHECK-NEXT: vslidedown.vx v25, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -752,43 +754,49 @@ define @vp_ceil_vv_nxv16f64( %va, < ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, 
v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll index 50db8a7592c45..1c003a33c54bf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll @@ -2697,9 +2697,9 @@ define @vp_ctpop_nxv16i64( %va, @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: 
vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index 4b3cf6181514b..e53877f53833f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -2700,18 +2700,18 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero @@ -5841,18 +5841,18 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index 48aa01a0e141e..55485beff8eb1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -1916,18 +1916,18 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi sp, sp, -48 ; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 48 ; 
RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v24, v0, 2 +; RV32-NEXT: vslidedown.vi v1, v0, 2 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 44(sp) @@ -1955,59 +1955,43 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a2, sp, 40 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 +; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 24 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsub.vv v24, v8, v16, v0.t ; RV32-NEXT: addi a2, sp, 32 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 4 @@ -2016,31 +2000,28 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vadd.vv v16, v16, v8, v0.t ; RV32-NEXT: addi a2, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vlse64.v v8, (a2), zero ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 5 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 -; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a2, sp, 16 ; 
RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 48 +; RV32-NEXT: addi a2, sp, 48 ; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 48 ; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -2049,82 +2030,50 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v24 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: addi a0, sp, 48 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v24, v8, v0.t ; RV32-NEXT: vsub.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size 
Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 48 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index bb30de41c4479..28df7f083c4a0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -2182,16 +2182,16 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB34_2 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bltu a0, a3, .LBB34_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 +; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB34_2: -; RV32-NEXT: li a2, 1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb @@ -2216,7 +2216,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 @@ -2246,7 +2246,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -2272,20 +2272,20 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero @@ -2294,10 +2294,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: add 
a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: li a2, 56 +; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 @@ -2313,10 +2313,10 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v16, v8, a2, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: vnot.v v16, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t @@ -2326,8 +2326,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2341,8 +2341,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2354,8 +2354,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2369,8 +2369,8 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -2381,7 +2381,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 @@ -2408,24 +2408,24 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a1, 16 
; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB34_2 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: bltu a0, a1, .LBB34_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB34_2: -; RV64-NEXT: li a2, 1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: li a1, 1 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a3, a1, 32 -; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vand.vx v16, v16, a2, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: lui a3, 209715 ; RV64-NEXT: addiw a3, a3, 819 @@ -2462,11 +2462,11 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vand.vx v16, v16, a2, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: vand.vx v16, v8, a3, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -4827,16 +4827,16 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: li a2, 16 +; RV32-NEXT: li a3, 16 ; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: bltu a0, a2, .LBB70_2 +; RV32-NEXT: mv a2, a0 +; RV32-NEXT: bltu a0, a3, .LBB70_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 +; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB70_2: -; RV32-NEXT: li a2, 1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: li a1, 1 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a1, v0.t ; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb @@ -4861,7 +4861,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 @@ -4891,7 +4891,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 @@ -4917,20 +4917,20 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi 
v8, v16, 4, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 24 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a3, sp, 16 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a3), zero @@ -4939,10 +4939,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: li a1, 56 -; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t +; RV32-NEXT: li a2, 56 +; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 @@ -4958,10 +4958,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v16, v8, a2, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: vnot.v v16, v16, v0.t +; RV32-NEXT: vand.vv v8, v16, v8, v0.t ; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t @@ -4971,8 +4971,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -4986,8 +4986,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -4999,8 +4999,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -5014,8 +5014,8 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a2, 40 
-; RV32-NEXT: mul a0, a0, a2 +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload @@ -5026,7 +5026,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 @@ -5053,24 +5053,24 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: li a2, 16 +; RV64-NEXT: li a1, 16 ; RV64-NEXT: vslidedown.vi v24, v0, 2 -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: bltu a0, a2, .LBB70_2 +; RV64-NEXT: mv a2, a0 +; RV64-NEXT: bltu a0, a1, .LBB70_2 ; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a1, 16 +; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB70_2: -; RV64-NEXT: li a2, 1 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: li a1, 1 +; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: addiw a1, a1, 1365 -; RV64-NEXT: slli a3, a1, 32 -; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: addiw a2, a2, 1365 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: vand.vx v16, v16, a2, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: lui a3, 209715 ; RV64-NEXT: addiw a3, a3, 819 @@ -5107,11 +5107,11 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsub.vx v16, v8, a2, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1, v0.t ; RV64-NEXT: vnot.v v8, v8, v0.t ; RV64-NEXT: vand.vv v8, v8, v16, v0.t ; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vand.vx v16, v16, a2, v0.t ; RV64-NEXT: vsub.vv v8, v8, v16, v0.t ; RV64-NEXT: vand.vx v16, v8, a3, v0.t ; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll index 7a26bf2bfdf0c..84b3e142d5aea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll @@ -777,7 +777,7 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, 
a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll index 4e60edf058450..84ef9283802b9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll @@ -1031,72 +1031,80 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vslidedown.vi v16, v8, 1 ; RV32-NEXT: vsrl.vx v24, v16, a1 -; RV32-NEXT: vmv.x.s a5, v24 -; RV32-NEXT: vmv.x.s a6, v16 +; RV32-NEXT: vmv.x.s s11, v24 +; RV32-NEXT: vmv.x.s ra, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 2 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s a3, v24 ; RV32-NEXT: vmv.x.s a4, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 3 ; RV32-NEXT: vsrl.vx v24, v16, a1 +; RV32-NEXT: vmv.x.s s0, v24 +; RV32-NEXT: vmv.x.s a5, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 4 +; RV32-NEXT: vsrl.vx v24, v16, a1 +; RV32-NEXT: vmv.x.s s1, v24 +; RV32-NEXT: vmv.x.s a6, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 5 +; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s2, v24 ; RV32-NEXT: vmv.x.s a7, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 4 +; RV32-NEXT: vslidedown.vi v16, v8, 6 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s3, v24 ; RV32-NEXT: vmv.x.s t0, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 5 +; RV32-NEXT: vslidedown.vi v16, v8, 7 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s4, v24 ; RV32-NEXT: vmv.x.s t1, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 6 +; RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s5, v24 ; RV32-NEXT: vmv.x.s t2, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 7 +; RV32-NEXT: vslidedown.vi v16, v8, 9 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s6, v24 ; RV32-NEXT: vmv.x.s t3, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 8 +; RV32-NEXT: vslidedown.vi v16, v8, 10 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s7, v24 ; RV32-NEXT: vmv.x.s t4, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 9 +; RV32-NEXT: vslidedown.vi v16, v8, 11 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s8, v24 ; RV32-NEXT: vmv.x.s t5, v16 -; 
RV32-NEXT: vslidedown.vi v16, v8, 10 +; RV32-NEXT: vslidedown.vi v16, v8, 12 ; RV32-NEXT: vsrl.vx v24, v16, a1 ; RV32-NEXT: vmv.x.s s9, v24 ; RV32-NEXT: vmv.x.s t6, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 11 -; RV32-NEXT: vsrl.vx v24, v16, a1 -; RV32-NEXT: vmv.x.s s10, v24 -; RV32-NEXT: vmv.x.s s0, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 12 -; RV32-NEXT: vsrl.vx v24, v16, a1 -; RV32-NEXT: vmv.x.s s11, v24 -; RV32-NEXT: vmv.x.s s1, v16 -; RV32-NEXT: vslidedown.vi v0, v8, 13 -; RV32-NEXT: vsrl.vx v16, v0, a1 -; RV32-NEXT: vmv.x.s ra, v16 +; RV32-NEXT: vslidedown.vi v24, v8, 13 +; RV32-NEXT: vsrl.vx v16, v24, a1 +; RV32-NEXT: vmv.x.s s10, v16 ; RV32-NEXT: vslidedown.vi v16, v8, 14 -; RV32-NEXT: vsrl.vx v24, v16, a1 +; RV32-NEXT: vsrl.vx v0, v16, a1 ; RV32-NEXT: vslidedown.vi v8, v8, 15 -; RV32-NEXT: vmv.x.s a2, v0 -; RV32-NEXT: vsrl.vx v0, v8, a1 +; RV32-NEXT: vmv.x.s a2, v24 +; RV32-NEXT: vsrl.vx v24, v8, a1 ; RV32-NEXT: lw a1, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: add a5, a1, a5 -; RV32-NEXT: add a6, a0, a6 -; RV32-NEXT: sltu a0, a6, a0 -; RV32-NEXT: add a0, a5, a0 +; RV32-NEXT: add s11, a1, s11 +; RV32-NEXT: add ra, a0, ra +; RV32-NEXT: sltu a0, ra, a0 +; RV32-NEXT: add a0, s11, a0 ; RV32-NEXT: add a0, a0, a3 -; RV32-NEXT: add a4, a6, a4 -; RV32-NEXT: sltu a1, a4, a6 +; RV32-NEXT: add a4, ra, a4 +; RV32-NEXT: sltu a1, a4, ra +; RV32-NEXT: add a1, a1, s0 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a5, a4, a5 +; RV32-NEXT: sltu a1, a5, a4 +; RV32-NEXT: add a1, a1, s1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a6, a5, a6 +; RV32-NEXT: sltu a1, a6, a5 ; RV32-NEXT: add a1, a1, s2 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a7, a4, a7 -; RV32-NEXT: sltu a1, a7, a4 +; RV32-NEXT: add a7, a6, a7 +; RV32-NEXT: sltu a1, a7, a6 ; RV32-NEXT: add a1, a1, s3 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add t0, a7, t0 @@ -1127,21 +1135,13 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-NEXT: sltu a1, t6, t5 ; RV32-NEXT: add a1, a1, s10 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add s0, t6, s0 -; RV32-NEXT: sltu a1, s0, t6 -; RV32-NEXT: add a1, a1, s11 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add s1, s0, s1 -; RV32-NEXT: sltu a1, s1, s0 -; RV32-NEXT: add a1, a1, ra -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vmv.x.s a1, v24 -; RV32-NEXT: add a2, s1, a2 -; RV32-NEXT: sltu a3, a2, s1 +; RV32-NEXT: vmv.x.s a1, v0 +; RV32-NEXT: add a2, t6, a2 +; RV32-NEXT: sltu a3, a2, t6 ; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: vmv.x.s a3, v16 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: vmv.x.s a1, v0 +; RV32-NEXT: vmv.x.s a1, v24 ; RV32-NEXT: add a3, a2, a3 ; RV32-NEXT: sltu a2, a3, a2 ; RV32-NEXT: add a1, a2, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index ed8ebe83b89af..b3099f6b57056 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -141,7 +141,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 38 +; RV32-NEXT: li a5, 29 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -149,13 +149,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vid.v v10 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: 
slli a5, a4, 3 +; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs2r.v v10, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vadd.vi v8, v10, -4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 12 +; RV32-NEXT: li a5, 13 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -163,7 +164,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 18 +; RV32-NEXT: li a5, 21 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -173,15 +174,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a4, 12 ; RV32-NEXT: vmv.s.x v0, a4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 22 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 46 +; RV32-NEXT: li a5, 45 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -189,7 +189,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 26 +; RV32-NEXT: li a5, 25 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 @@ -211,14 +211,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 54 +; RV32-NEXT: li a4, 37 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 30 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -226,8 +226,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a6, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill @@ -242,28 +242,29 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 26 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 26 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add 
a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vi v8, v10, -2 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -271,29 +272,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: vadd.vi v8, v10, -8 -; RV32-NEXT: vmv2r.v v22, v10 +; RV32-NEXT: vmv2r.v v30, v10 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v20, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv1r.v v0, v20 +; RV32-NEXT: vl1r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v24, v8, v0.t -; RV32-NEXT: vmv.v.v v16, v12 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; RV32-NEXT: vmv.v.v v24, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) -; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 @@ -301,66 +301,66 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 54 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 30 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 
%hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v24, v8 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v4, v16, v8 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v22, -6 +; RV32-NEXT: vadd.vi v8, v30, -6 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: vmv1r.v v0, v20 +; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: vmv1r.v v2, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v16, v12 +; RV32-NEXT: vrgatherei16.vv v4, v16, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu @@ -369,77 +369,68 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v20, (a1) ; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v24, a1 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs1r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v1, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 54 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v20 -; RV32-NEXT: vmv1r.v v0, v24 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v8, v20 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 30 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v4, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a3, a1, 3 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) ; RV32-NEXT: vsetivli zero, 
16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.v v4, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_8) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu @@ -448,162 +439,161 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: vle16.v v20, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 54 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv.v.v v12, v8 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v4, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 22 +; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, 
(a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v14, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v14 +; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v20, v16, v10 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t +; RV32-NEXT: vrgatherei16.vv v20, v24, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vle16.v v16, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 1008 -; RV32-NEXT: vmv.s.x v16, a1 +; RV32-NEXT: vmv.s.x v28, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 38 +; RV32-NEXT: li a3, 29 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs1r.v v28, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 54 +; RV32-NEXT: li a3, 37 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v0, v12 -; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 30 +; RV32-NEXT: li a3, 53 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v24 +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: lui a1, %hi(.LCPI6_13) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 18 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 46 +; RV32-NEXT: li a3, 45 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v24, v16, v0.t +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; 
RV32-NEXT: lui a1, %hi(.LCPI6_14) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_14) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a2, %hi(.LCPI6_15) ; RV32-NEXT: addi a2, a2, %lo(.LCPI6_15) -; RV32-NEXT: vle16.v v0, (a1) -; RV32-NEXT: vle16.v v4, (a2) +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v8, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 54 +; RV32-NEXT: li a2, 45 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v0 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 38 +; RV32-NEXT: li a2, 37 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 29 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 30 +; RV32-NEXT: li a2, 53 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v4, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 45 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v12, v24 +; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v12, (a1) +; RV32-NEXT: vse32.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 22 +; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 @@ -611,21 +601,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a3, a2, 3 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a3, a2, 4 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 26 +; RV32-NEXT: li a2, 25 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -843,7 +834,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vrgatherei16.vv v8, v16, v28 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v28, v2, -13 +; RV64-NEXT: vadd.vi v16, v2, -13 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; RV64-NEXT: vmv1r.v v0, v4 ; RV64-NEXT: csrr a1, vlenb @@ -851,8 
+842,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV64-NEXT: lui a1, 16 ; RV64-NEXT: addiw a1, a1, 7 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index ac5c11ca88df5..112c0a0e598d0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -6893,89 +6893,89 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV64ZVE32F-LABEL: mgather_baseidx_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a7, v0 -; RV64ZVE32F-NEXT: andi a4, a7, 1 +; RV64ZVE32F-NEXT: vmv.x.s a6, v0 +; RV64ZVE32F-NEXT: andi a4, a6, 1 ; RV64ZVE32F-NEXT: beqz a4, .LBB57_9 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a4, 0(a2) ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) -; RV64ZVE32F-NEXT: andi a5, a7, 2 +; RV64ZVE32F-NEXT: andi a5, a6, 2 ; RV64ZVE32F-NEXT: bnez a5, .LBB57_10 ; RV64ZVE32F-NEXT: .LBB57_2: ; RV64ZVE32F-NEXT: ld a5, 8(a3) -; RV64ZVE32F-NEXT: andi a6, a7, 4 -; RV64ZVE32F-NEXT: bnez a6, .LBB57_11 +; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: bnez a7, .LBB57_11 ; RV64ZVE32F-NEXT: .LBB57_3: -; RV64ZVE32F-NEXT: ld a6, 16(a3) -; RV64ZVE32F-NEXT: andi t0, a7, 8 +; RV64ZVE32F-NEXT: ld a7, 16(a3) +; RV64ZVE32F-NEXT: andi t0, a6, 8 ; RV64ZVE32F-NEXT: bnez t0, .LBB57_12 ; RV64ZVE32F-NEXT: .LBB57_4: ; RV64ZVE32F-NEXT: ld t0, 24(a3) -; RV64ZVE32F-NEXT: andi t1, a7, 16 +; RV64ZVE32F-NEXT: andi t1, a6, 16 ; RV64ZVE32F-NEXT: bnez t1, .LBB57_13 ; RV64ZVE32F-NEXT: .LBB57_5: ; RV64ZVE32F-NEXT: ld t1, 32(a3) -; RV64ZVE32F-NEXT: andi t2, a7, 32 +; RV64ZVE32F-NEXT: andi t2, a6, 32 ; RV64ZVE32F-NEXT: bnez t2, .LBB57_14 ; RV64ZVE32F-NEXT: .LBB57_6: ; RV64ZVE32F-NEXT: ld t2, 40(a3) -; RV64ZVE32F-NEXT: andi t3, a7, 64 +; RV64ZVE32F-NEXT: andi t3, a6, 64 ; RV64ZVE32F-NEXT: bnez t3, .LBB57_15 ; RV64ZVE32F-NEXT: .LBB57_7: ; RV64ZVE32F-NEXT: ld t3, 48(a3) -; RV64ZVE32F-NEXT: andi a7, a7, -128 -; RV64ZVE32F-NEXT: bnez a7, .LBB57_16 +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: bnez a6, .LBB57_16 ; RV64ZVE32F-NEXT: .LBB57_8: ; RV64ZVE32F-NEXT: ld a1, 56(a3) ; RV64ZVE32F-NEXT: j .LBB57_17 ; RV64ZVE32F-NEXT: .LBB57_9: ; RV64ZVE32F-NEXT: ld a4, 0(a3) -; RV64ZVE32F-NEXT: andi a5, a7, 2 +; RV64ZVE32F-NEXT: andi a5, a6, 2 ; RV64ZVE32F-NEXT: beqz a5, .LBB57_2 ; RV64ZVE32F-NEXT: .LBB57_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a5, 8(a2) ; RV64ZVE32F-NEXT: slli a5, a5, 3 ; RV64ZVE32F-NEXT: add a5, a1, a5 ; RV64ZVE32F-NEXT: ld a5, 0(a5) -; RV64ZVE32F-NEXT: andi a6, a7, 4 -; RV64ZVE32F-NEXT: beqz a6, .LBB57_3 +; RV64ZVE32F-NEXT: andi a7, a6, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB57_3 ; RV64ZVE32F-NEXT: .LBB57_11: # %cond.load4 -; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: slli a6, a6, 3 -; RV64ZVE32F-NEXT: add a6, a1, a6 -; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi t0, a7, 8 +; RV64ZVE32F-NEXT: ld a7, 16(a2) +; RV64ZVE32F-NEXT: slli a7, a7, 3 +; RV64ZVE32F-NEXT: add a7, a1, a7 +; RV64ZVE32F-NEXT: ld 
a7, 0(a7) +; RV64ZVE32F-NEXT: andi t0, a6, 8 ; RV64ZVE32F-NEXT: beqz t0, .LBB57_4 ; RV64ZVE32F-NEXT: .LBB57_12: # %cond.load7 ; RV64ZVE32F-NEXT: ld t0, 24(a2) ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a7, 16 +; RV64ZVE32F-NEXT: andi t1, a6, 16 ; RV64ZVE32F-NEXT: beqz t1, .LBB57_5 ; RV64ZVE32F-NEXT: .LBB57_13: # %cond.load10 ; RV64ZVE32F-NEXT: ld t1, 32(a2) ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: andi t2, a7, 32 +; RV64ZVE32F-NEXT: andi t2, a6, 32 ; RV64ZVE32F-NEXT: beqz t2, .LBB57_6 ; RV64ZVE32F-NEXT: .LBB57_14: # %cond.load13 ; RV64ZVE32F-NEXT: ld t2, 40(a2) ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi t3, a7, 64 +; RV64ZVE32F-NEXT: andi t3, a6, 64 ; RV64ZVE32F-NEXT: beqz t3, .LBB57_7 ; RV64ZVE32F-NEXT: .LBB57_15: # %cond.load16 ; RV64ZVE32F-NEXT: ld t3, 48(a2) ; RV64ZVE32F-NEXT: slli t3, t3, 3 ; RV64ZVE32F-NEXT: add t3, a1, t3 ; RV64ZVE32F-NEXT: ld t3, 0(t3) -; RV64ZVE32F-NEXT: andi a7, a7, -128 -; RV64ZVE32F-NEXT: beqz a7, .LBB57_8 +; RV64ZVE32F-NEXT: andi a6, a6, -128 +; RV64ZVE32F-NEXT: beqz a6, .LBB57_8 ; RV64ZVE32F-NEXT: .LBB57_16: # %cond.load19 ; RV64ZVE32F-NEXT: ld a2, 56(a2) ; RV64ZVE32F-NEXT: slli a2, a2, 3 @@ -6984,7 +6984,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: .LBB57_17: # %else20 ; RV64ZVE32F-NEXT: sd a4, 0(a0) ; RV64ZVE32F-NEXT: sd a5, 8(a0) -; RV64ZVE32F-NEXT: sd a6, 16(a0) +; RV64ZVE32F-NEXT: sd a7, 16(a0) ; RV64ZVE32F-NEXT: sd t0, 24(a0) ; RV64ZVE32F-NEXT: sd t1, 32(a0) ; RV64ZVE32F-NEXT: sd t2, 40(a0) @@ -13024,19 +13024,19 @@ define <4 x i32> @mgather_narrow_edge_case(ptr %base) { define <8 x i16> @mgather_strided_2xSEW(ptr %base) { ; RV32-LABEL: mgather_strided_2xSEW: ; RV32: # %bb.0: -; RV32-NEXT: lui a1, %hi(.LCPI107_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI107_0) -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle8.v v9, (a1) +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vid.v v8 +; RV32-NEXT: vsll.vi v9, v8, 3 +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32-NEXT: vluxei8.v v8, (a0), v9 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_strided_2xSEW: ; RV64V: # %bb.0: -; RV64V-NEXT: lui a1, %hi(.LCPI107_0) -; RV64V-NEXT: addi a1, a1, %lo(.LCPI107_0) -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vle8.v v9, (a1) +; RV64V-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64V-NEXT: vid.v v8 +; RV64V-NEXT: vsll.vi v9, v8, 3 +; RV64V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64V-NEXT: vluxei8.v v8, (a0), v9 ; RV64V-NEXT: ret ; @@ -13141,19 +13141,19 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) { define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; RV32-LABEL: mgather_gather_2xSEW: ; RV32: # %bb.0: -; RV32-NEXT: lui a1, %hi(.LCPI108_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI108_0) -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle8.v v9, (a1) +; RV32-NEXT: lui a1, 82176 +; RV32-NEXT: addi a1, a1, 1024 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.s.x v9, a1 ; RV32-NEXT: vluxei8.v v8, (a0), v9 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_gather_2xSEW: ; RV64V: # %bb.0: -; RV64V-NEXT: lui a1, %hi(.LCPI108_0) -; RV64V-NEXT: addi a1, a1, %lo(.LCPI108_0) -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vle8.v v9, (a1) +; RV64V-NEXT: lui a1, 82176 +; RV64V-NEXT: addiw a1, 
a1, 1024 +; RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64V-NEXT: vmv.s.x v9, a1 ; RV64V-NEXT: vluxei8.v v8, (a0), v9 ; RV64V-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 7309fb9edec06..60b61e889315c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -11296,9 +11296,8 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, define void @mscatter_unit_stride(<8 x i16> %val, ptr %base) { ; CHECK-LABEL: mscatter_unit_stride: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i16 0 %allones = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -11311,9 +11310,8 @@ define void @mscatter_unit_stride_with_offset(<8 x i16> %val, ptr %base) { ; CHECK-LABEL: mscatter_unit_stride_with_offset: ; CHECK: # %bb.0: ; CHECK-NEXT: addi a0, a0, 10 -; CHECK-NEXT: li a1, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vsse16.v v8, (a0), a1 +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i16 0 %allones = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll index f09485c2fcbe2..b3011d0f01cab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll @@ -401,41 +401,54 @@ define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 18 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: addi a3, a2, 128 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a4, a3, 3 +; RV32-NEXT: add a3, a4, a3 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vle64.v v24, (a2) +; RV32-NEXT: vle64.v v0, (a2) ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vmseq.vv v1, v24, v8 +; RV32-NEXT: vmseq.vv v8, v0, v24 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vs1r.v v8, (a2) # Unknown-size Folded Spill ; RV32-NEXT: addi a2, a0, 128 -; RV32-NEXT: vle64.v v24, (a2) +; RV32-NEXT: vle64.v v8, (a2) ; RV32-NEXT: vle64.v v16, (a0) ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a2, a0, 3 +; RV32-NEXT: add a0, a2, a0 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmseq.vv v0, v16, v8 +; RV32-NEXT: vmseq.vv v0, v16, v24 ; RV32-NEXT: addi a0, a1, 128 -; RV32-NEXT: vse64.v v24, (a0), v0.t -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vse64.v v8, (a0), v0.t +; RV32-NEXT: csrr 
a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vse64.v v8, (a1), v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 18 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index cba78368b2e7d..d9958f4aae350 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -595,7 +595,15 @@ declare <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: vslidedown.vi v1, v0, 2 @@ -607,35 +615,43 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: frflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: vmv.v.v v16, v24 +; CHECK-NEXT: 
csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll index 6982a7cb5cb5e..3e0fb3009c6b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll @@ -543,54 +543,65 @@ declare <32 x double> @llvm.vp.rint.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v32f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI26_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; 
CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll index 8e5f8cfe0570b..504982111d055 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll @@ -777,7 +777,7 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll index 0cbde264378ff..35480164d4a12 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll @@ -777,7 +777,7 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: vs8r.v v16, (a1) # 
Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll index 24481e4d77a01..4928eba52ac8c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll @@ -777,7 +777,7 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB26_2 ; CHECK-NEXT: # %bb.1: @@ -798,32 +798,32 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v 
v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll index aae047260163f..e558d45a3b2d7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll @@ -634,7 +634,7 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: vmv1r.v v1, v0 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 @@ -653,13 +653,13 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: and a2, a4, a2 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vmseq.vv v1, v16, v8, v0.t +; CHECK-NEXT: vmseq.vv v2, v16, v8, v0.t ; CHECK-NEXT: bltu a3, a1, .LBB51_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: .LBB51_2: ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 @@ -669,7 +669,7 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t ; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vmv1r.v v8, v1 +; CHECK-NEXT: vmv1r.v v8, v2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll index 96100d2b62e41..2ae031798f5bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll @@ -96,6 +96,16 @@ define <8 x i8> @strided_vpload_v8i8(ptr %ptr, i32 signext %stride, <8 x i1> %m, ret <8 x i8> %load } +define <8 x i8> @strided_vpload_v8i8_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_v8i8_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i32(ptr %ptr, i32 1, <8 x i1> %m, i32 %evl) + ret <8 x i8> %load +} + declare <2 x i16> 
@llvm.experimental.vp.strided.load.v2i16.p0.i32(ptr, i32, <2 x i1>, i32) define <2 x i16> @strided_vpload_v2i16(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) { @@ -132,6 +142,16 @@ define <8 x i16> @strided_vpload_v8i16(ptr %ptr, i32 signext %stride, <8 x i1> % ret <8 x i16> %load } +define <8 x i16> @strided_vpload_v8i16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_v8i16_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <8 x i16> @llvm.experimental.vp.strided.load.v8i16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl) + ret <8 x i16> %load +} + define <8 x i16> @strided_vpload_v8i16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { ; CHECK-LABEL: strided_vpload_v8i16_allones_mask: ; CHECK: # %bb.0: @@ -168,6 +188,16 @@ define <4 x i32> @strided_vpload_v4i32(ptr %ptr, i32 signext %stride, <4 x i1> % ret <4 x i32> %load } +define <4 x i32> @strided_vpload_v4i32_unit_stride(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_v4i32_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i32(ptr %ptr, i32 4, <4 x i1> %m, i32 %evl) + ret <4 x i32> %load +} + declare <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i32(ptr, i32, <8 x i1>, i32) define <8 x i32> @strided_vpload_v8i32(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) { @@ -204,6 +234,16 @@ define <2 x i64> @strided_vpload_v2i64(ptr %ptr, i32 signext %stride, <2 x i1> % ret <2 x i64> %load } +define <2 x i64> @strided_vpload_v2i64_unit_stride(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_v2i64_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i32(ptr %ptr, i32 8, <2 x i1> %m, i32 %evl) + ret <2 x i64> %load +} + declare <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr, i32, <4 x i1>, i32) define <4 x i64> @strided_vpload_v4i64(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) { @@ -288,6 +328,16 @@ define <8 x half> @strided_vpload_v8f16(ptr %ptr, i32 signext %stride, <8 x i1> ret <8 x half> %load } +define <8 x half> @strided_vpload_v8f16_unit_stride(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_v8f16_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i32(ptr %ptr, i32 2, <8 x i1> %m, i32 %evl) + ret <8 x half> %load +} + declare <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i32(ptr, i32, <2 x i1>, i32) define <2 x float> @strided_vpload_v2f32(ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) { @@ -312,6 +362,16 @@ define <4 x float> @strided_vpload_v4f32(ptr %ptr, i32 signext %stride, <4 x i1> ret <4 x float> %load } +define <4 x float> @strided_vpload_v4f32_unit_stride(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_v4f32_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i32(ptr %ptr, 
i32 4, <4 x i1> %m, i32 %evl) + ret <4 x float> %load +} + declare <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i32(ptr, i32, <8 x i1>, i32) define <8 x float> @strided_vpload_v8f32(ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) { @@ -348,6 +408,17 @@ define <2 x double> @strided_vpload_v2f64(ptr %ptr, i32 signext %stride, <2 x i1 ret <2 x double> %load } +define <2 x double> @strided_vpload_v2f64_unit_stride(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_v2f64_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call <2 x double> @llvm.experimental.vp.strided.load.v2f64.p0.i32(ptr %ptr, i32 8, <2 x i1> %m, i32 %evl) + ret <2 x double> %load +} + + declare <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i32(ptr, i32, <4 x i1>, i32) define <4 x double> @strided_vpload_v4f64(ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) { @@ -416,10 +487,10 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x ; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: mv a3, a2 -; CHECK-NEXT: bltu a2, a4, .LBB33_2 +; CHECK-NEXT: bltu a2, a4, .LBB40_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 16 -; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: .LBB40_2: ; CHECK-NEXT: mul a4, a3, a1 ; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: addi a5, a2, -16 @@ -444,10 +515,10 @@ define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext % ; CHECK: # %bb.0: ; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: mv a3, a2 -; CHECK-NEXT: bltu a2, a4, .LBB34_2 +; CHECK-NEXT: bltu a2, a4, .LBB41_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 16 -; CHECK-NEXT: .LBB34_2: +; CHECK-NEXT: .LBB41_2: ; CHECK-NEXT: mul a4, a3, a1 ; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: addi a5, a2, -16 @@ -474,10 +545,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV32-NEXT: li a5, 32 ; CHECK-RV32-NEXT: vmv1r.v v8, v0 ; CHECK-RV32-NEXT: mv a3, a4 -; CHECK-RV32-NEXT: bltu a4, a5, .LBB35_2 +; CHECK-RV32-NEXT: bltu a4, a5, .LBB42_2 ; CHECK-RV32-NEXT: # %bb.1: ; CHECK-RV32-NEXT: li a3, 32 -; CHECK-RV32-NEXT: .LBB35_2: +; CHECK-RV32-NEXT: .LBB42_2: ; CHECK-RV32-NEXT: mul a5, a3, a2 ; CHECK-RV32-NEXT: addi a6, a4, -32 ; CHECK-RV32-NEXT: sltu a4, a4, a6 @@ -485,10 +556,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV32-NEXT: and a6, a4, a6 ; CHECK-RV32-NEXT: li a4, 16 ; CHECK-RV32-NEXT: add a5, a1, a5 -; CHECK-RV32-NEXT: bltu a6, a4, .LBB35_4 +; CHECK-RV32-NEXT: bltu a6, a4, .LBB42_4 ; CHECK-RV32-NEXT: # %bb.3: ; CHECK-RV32-NEXT: li a6, 16 -; CHECK-RV32-NEXT: .LBB35_4: +; CHECK-RV32-NEXT: .LBB42_4: ; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v0, v8, 4 ; CHECK-RV32-NEXT: vsetvli zero, a6, e64, m8, ta, ma @@ -497,10 +568,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV32-NEXT: sltu a6, a3, a5 ; CHECK-RV32-NEXT: addi a6, a6, -1 ; CHECK-RV32-NEXT: and a5, a6, a5 -; CHECK-RV32-NEXT: bltu a3, a4, .LBB35_6 +; CHECK-RV32-NEXT: bltu a3, a4, .LBB42_6 ; CHECK-RV32-NEXT: # %bb.5: ; CHECK-RV32-NEXT: li a3, 16 -; CHECK-RV32-NEXT: .LBB35_6: +; CHECK-RV32-NEXT: .LBB42_6: ; CHECK-RV32-NEXT: mul a4, a3, a2 ; CHECK-RV32-NEXT: add a4, a1, a4 ; CHECK-RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma @@ -524,10 +595,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> 
%mask ; CHECK-RV64-NEXT: li a5, 32 ; CHECK-RV64-NEXT: vmv1r.v v8, v0 ; CHECK-RV64-NEXT: mv a4, a3 -; CHECK-RV64-NEXT: bltu a3, a5, .LBB35_2 +; CHECK-RV64-NEXT: bltu a3, a5, .LBB42_2 ; CHECK-RV64-NEXT: # %bb.1: ; CHECK-RV64-NEXT: li a4, 32 -; CHECK-RV64-NEXT: .LBB35_2: +; CHECK-RV64-NEXT: .LBB42_2: ; CHECK-RV64-NEXT: mul a5, a4, a2 ; CHECK-RV64-NEXT: addi a6, a3, -32 ; CHECK-RV64-NEXT: sltu a3, a3, a6 @@ -535,10 +606,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV64-NEXT: and a6, a3, a6 ; CHECK-RV64-NEXT: li a3, 16 ; CHECK-RV64-NEXT: add a5, a1, a5 -; CHECK-RV64-NEXT: bltu a6, a3, .LBB35_4 +; CHECK-RV64-NEXT: bltu a6, a3, .LBB42_4 ; CHECK-RV64-NEXT: # %bb.3: ; CHECK-RV64-NEXT: li a6, 16 -; CHECK-RV64-NEXT: .LBB35_4: +; CHECK-RV64-NEXT: .LBB42_4: ; CHECK-RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v0, v8, 4 ; CHECK-RV64-NEXT: vsetvli zero, a6, e64, m8, ta, ma @@ -547,10 +618,10 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask ; CHECK-RV64-NEXT: sltu a6, a4, a5 ; CHECK-RV64-NEXT: addi a6, a6, -1 ; CHECK-RV64-NEXT: and a5, a6, a5 -; CHECK-RV64-NEXT: bltu a4, a3, .LBB35_6 +; CHECK-RV64-NEXT: bltu a4, a3, .LBB42_6 ; CHECK-RV64-NEXT: # %bb.5: ; CHECK-RV64-NEXT: li a4, 16 -; CHECK-RV64-NEXT: .LBB35_6: +; CHECK-RV64-NEXT: .LBB42_6: ; CHECK-RV64-NEXT: mul a3, a4, a2 ; CHECK-RV64-NEXT: add a3, a1, a3 ; CHECK-RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll index 781be5f607da1..6c4960bd40784 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll @@ -84,6 +84,16 @@ define void @strided_vpstore_v8i8(<8 x i8> %val, ptr %ptr, i32 signext %stride, ret void } +define void @strided_vpstore_v8i8_unit_stride(<8 x i8> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_v8i8_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.v8i8.p0.i32(<8 x i8> %val, ptr %ptr, i32 1, <8 x i1> %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.v2i16.p0.i32(<2 x i16>, ptr, i32, <2 x i1>, i32) define void @strided_vpstore_v2i16(<2 x i16> %val, ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) { @@ -120,6 +130,16 @@ define void @strided_vpstore_v8i16(<8 x i16> %val, ptr %ptr, i32 signext %stride ret void } +define void @strided_vpstore_v8i16_unit_stride(<8 x i16> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_v8i16_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.v8i16.p0.i32(<8 x i16> %val, ptr %ptr, i32 2, <8 x i1> %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.v2i32.p0.i32(<2 x i32>, ptr, i32, <2 x i1>, i32) define void @strided_vpstore_v2i32(<2 x i32> %val, ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) { @@ -144,6 +164,16 @@ define void @strided_vpstore_v4i32(<4 x i32> %val, ptr %ptr, i32 signext %stride ret void } +define void @strided_vpstore_v4i32_unit_stride(<4 x i32> %val, ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_v4i32_unit_stride: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.v4i32.p0.i32(<4 x i32> %val, ptr %ptr, i32 4, <4 x i1> %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.v8i32.p0.i32(<8 x i32>, ptr, i32, <8 x i1>, i32) define void @strided_vpstore_v8i32(<8 x i32> %val, ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) { @@ -168,6 +198,16 @@ define void @strided_vpstore_v2i64(<2 x i64> %val, ptr %ptr, i32 signext %stride ret void } +define void @strided_vpstore_v2i64_unit_stride(<2 x i64> %val, ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_v2i64_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.v2i64.p0.i32(<2 x i64> %val, ptr %ptr, i32 8, <2 x i1> %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.v4i64.p0.i32(<4 x i64>, ptr, i32, <4 x i1>, i32) define void @strided_vpstore_v4i64(<4 x i64> %val, ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) { @@ -228,6 +268,16 @@ define void @strided_vpstore_v8f16(<8 x half> %val, ptr %ptr, i32 signext %strid ret void } +define void @strided_vpstore_v8f16_unit_stride(<8 x half> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_v8f16_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.v8f16.p0.i32(<8 x half> %val, ptr %ptr, i32 2, <8 x i1> %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.v2f32.p0.i32(<2 x float>, ptr, i32, <2 x i1>, i32) define void @strided_vpstore_v2f32(<2 x float> %val, ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) { @@ -252,6 +302,16 @@ define void @strided_vpstore_v4f32(<4 x float> %val, ptr %ptr, i32 signext %stri ret void } +define void @strided_vpstore_v4f32_unit_stride(<4 x float> %val, ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_v4f32_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.v4f32.p0.i32(<4 x float> %val, ptr %ptr, i32 4, <4 x i1> %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.v8f32.p0.i32(<8 x float>, ptr, i32, <8 x i1>, i32) define void @strided_vpstore_v8f32(<8 x float> %val, ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) { @@ -276,6 +336,16 @@ define void @strided_vpstore_v2f64(<2 x double> %val, ptr %ptr, i32 signext %str ret void } +define void @strided_vpstore_v2f64_unit_stride(<2 x double> %val, ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_v2f64_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.v2f64.p0.i32(<2 x double> %val, ptr %ptr, i32 8, <2 x i1> %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.v4f64.p0.i32(<4 x double>, ptr, i32, <4 x i1>, i32) define void @strided_vpstore_v4f64(<4 x double> %val, ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) { @@ -343,10 +413,10 @@ define void @strided_store_v32f64(<32 x double> %v, ptr %ptr, i32 signext 
%strid ; CHECK: # %bb.0: ; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: mv a3, a2 -; CHECK-NEXT: bltu a2, a4, .LBB27_2 +; CHECK-NEXT: bltu a2, a4, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 16 -; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t ; CHECK-NEXT: mul a3, a3, a1 @@ -369,10 +439,10 @@ define void @strided_store_v32f64_allones_mask(<32 x double> %v, ptr %ptr, i32 s ; CHECK: # %bb.0: ; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: mv a3, a2 -; CHECK-NEXT: bltu a2, a4, .LBB28_2 +; CHECK-NEXT: bltu a2, a4, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 16 -; CHECK-NEXT: .LBB28_2: +; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vsse64.v v8, (a0), a1 ; CHECK-NEXT: mul a3, a3, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll index 9212b4047d97b..12d96fbfb88d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll @@ -8,18 +8,18 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a4, 40 +; CHECK-NEXT: li a4, 48 ; CHECK-NEXT: mul a2, a2, a4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: li a4, 24 -; CHECK-NEXT: mul a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 5 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 5 +; CHECK-NEXT: li a4, 40 +; CHECK-NEXT: mul a2, a2, a4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill @@ -29,70 +29,82 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 { ; CHECK-NEXT: addi a4, a3, 384 ; CHECK-NEXT: vle8.v v8, (a4) ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: li a5, 24 +; CHECK-NEXT: mul a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill ; CHECK-NEXT: addi a4, a1, 128 ; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: addi a1, a3, 256 ; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: vle8.v v16, (a4) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v8, (a4) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vle8.v v24, (a2) ; CHECK-NEXT: vle8.v v0, (a3) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded 
Reload -; CHECK-NEXT: vadd.vv v8, v0, v8 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v16, v16, v8 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: li a2, 24 ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v16, v16, v8 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v0, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v24, v8, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a2, 40 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v0, v8, v0 ; CHECK-NEXT: vse8.v v0, (a0) ; CHECK-NEXT: addi a1, a0, 384 ; CHECK-NEXT: vse8.v v16, (a1) ; CHECK-NEXT: addi a1, a0, 256 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vse8.v v16, (a1) +; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: vse8.v v24, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 48 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll index 4da164c4fcaa8..2d348deb939ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll @@ -154,38 +154,48 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v2, v8 -; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vmv1r.v v8, v0 
; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v24, (a0) ; CHECK-NEXT: addi a0, a1, 128 -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v16, (a0) ; CHECK-NEXT: addi a0, a3, -128 ; CHECK-NEXT: sltu a4, a3, a0 ; CHECK-NEXT: addi a4, a4, -1 -; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: and a0, a4, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vvm v24, v16, v24, v0 ; CHECK-NEXT: bltu a3, a2, .LBB11_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 128 ; CHECK-NEXT: .LBB11_2: ; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vmv8r.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll index 57e76a354a138..6c4f523aa8d94 100644 --- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll @@ -737,14 +737,16 @@ define @vp_floor_nxv16f64( %va, @vp_floor_nxv16f64( %va, @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i32_mm: ; CHECK-NOV: # %bb.0: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a1, fa3, rtz -; CHECK-NOV-NEXT: li a4, -1 -; CHECK-NOV-NEXT: srli a4, a4, 32 +; CHECK-NOV-NEXT: li a3, -1 +; CHECK-NOV-NEXT: srli a3, a3, 32 ; CHECK-NOV-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a4, .LBB32_6 +; CHECK-NOV-NEXT: bge a1, a3, .LBB32_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a4, .LBB32_7 +; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz +; CHECK-NOV-NEXT: bge a2, a3, .LBB32_7 ; CHECK-NOV-NEXT: .LBB32_2: # %entry ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NOV-NEXT: bge a3, a4, .LBB32_8 +; CHECK-NOV-NEXT: bge a4, a3, .LBB32_8 ; CHECK-NOV-NEXT: .LBB32_3: # %entry -; CHECK-NOV-NEXT: blt a5, a4, .LBB32_5 +; CHECK-NOV-NEXT: blt a5, a3, .LBB32_5 ; CHECK-NOV-NEXT: .LBB32_4: # %entry -; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB32_5: # %entry -; CHECK-NOV-NEXT: sgtz a4, a5 -; CHECK-NOV-NEXT: negw a4, a4 -; CHECK-NOV-NEXT: and a4, a4, a5 -; CHECK-NOV-NEXT: sgtz a5, a3 +; CHECK-NOV-NEXT: sgtz a3, a5 +; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: and a3, a3, a5 +; CHECK-NOV-NEXT: sgtz a5, a4 ; CHECK-NOV-NEXT: negw a5, a5 -; CHECK-NOV-NEXT: and a3, a5, a3 +; CHECK-NOV-NEXT: and a4, a5, a4 ; CHECK-NOV-NEXT: sgtz a5, a2 ; CHECK-NOV-NEXT: negw a5, a5 ; CHECK-NOV-NEXT: and a2, a5, a2 @@ -3643,20 +3643,20 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: and a1, a5, a1 ; CHECK-NOV-NEXT: sw a1, 12(a0) ; CHECK-NOV-NEXT: sw a2, 8(a0) -; CHECK-NOV-NEXT: sw a3, 4(a0) -; CHECK-NOV-NEXT: sw a4, 0(a0) +; CHECK-NOV-NEXT: sw a4, 4(a0) +; CHECK-NOV-NEXT: sw a3, 0(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB32_6: # %entry -; CHECK-NOV-NEXT: mv 
a1, a4 -; CHECK-NOV-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a4, .LBB32_2 +; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: fcvt.l.s a4, fa1, rtz +; CHECK-NOV-NEXT: blt a2, a3, .LBB32_2 ; CHECK-NOV-NEXT: .LBB32_7: # %entry -; CHECK-NOV-NEXT: mv a2, a4 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NOV-NEXT: blt a3, a4, .LBB32_3 +; CHECK-NOV-NEXT: blt a4, a3, .LBB32_3 ; CHECK-NOV-NEXT: .LBB32_8: # %entry -; CHECK-NOV-NEXT: mv a3, a4 -; CHECK-NOV-NEXT: bge a5, a4, .LBB32_4 +; CHECK-NOV-NEXT: mv a4, a3 +; CHECK-NOV-NEXT: bge a5, a3, .LBB32_4 ; CHECK-NOV-NEXT: j .LBB32_5 ; ; CHECK-V-LABEL: ustest_f32i32_mm: @@ -4487,27 +4487,27 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-LABEL: ustest_f32i16_mm: ; CHECK-NOV: # %bb.0: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a1, fa3, rtz -; CHECK-NOV-NEXT: lui a4, 16 -; CHECK-NOV-NEXT: addiw a4, a4, -1 +; CHECK-NOV-NEXT: lui a3, 16 +; CHECK-NOV-NEXT: addiw a3, a3, -1 ; CHECK-NOV-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NOV-NEXT: bge a1, a4, .LBB41_6 +; CHECK-NOV-NEXT: bge a1, a3, .LBB41_6 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: bge a2, a4, .LBB41_7 +; CHECK-NOV-NEXT: fcvt.w.s a4, fa1, rtz +; CHECK-NOV-NEXT: bge a2, a3, .LBB41_7 ; CHECK-NOV-NEXT: .LBB41_2: # %entry ; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz -; CHECK-NOV-NEXT: bge a3, a4, .LBB41_8 +; CHECK-NOV-NEXT: bge a4, a3, .LBB41_8 ; CHECK-NOV-NEXT: .LBB41_3: # %entry -; CHECK-NOV-NEXT: blt a5, a4, .LBB41_5 +; CHECK-NOV-NEXT: blt a5, a3, .LBB41_5 ; CHECK-NOV-NEXT: .LBB41_4: # %entry -; CHECK-NOV-NEXT: mv a5, a4 +; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB41_5: # %entry -; CHECK-NOV-NEXT: sgtz a4, a5 -; CHECK-NOV-NEXT: negw a4, a4 -; CHECK-NOV-NEXT: and a4, a4, a5 -; CHECK-NOV-NEXT: sgtz a5, a3 +; CHECK-NOV-NEXT: sgtz a3, a5 +; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: and a3, a3, a5 +; CHECK-NOV-NEXT: sgtz a5, a4 ; CHECK-NOV-NEXT: negw a5, a5 -; CHECK-NOV-NEXT: and a3, a5, a3 +; CHECK-NOV-NEXT: and a4, a5, a4 ; CHECK-NOV-NEXT: sgtz a5, a2 ; CHECK-NOV-NEXT: negw a5, a5 ; CHECK-NOV-NEXT: and a2, a5, a2 @@ -4516,20 +4516,20 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: and a1, a5, a1 ; CHECK-NOV-NEXT: sh a1, 6(a0) ; CHECK-NOV-NEXT: sh a2, 4(a0) -; CHECK-NOV-NEXT: sh a3, 2(a0) -; CHECK-NOV-NEXT: sh a4, 0(a0) +; CHECK-NOV-NEXT: sh a4, 2(a0) +; CHECK-NOV-NEXT: sh a3, 0(a0) ; CHECK-NOV-NEXT: ret ; CHECK-NOV-NEXT: .LBB41_6: # %entry -; CHECK-NOV-NEXT: mv a1, a4 -; CHECK-NOV-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NOV-NEXT: blt a2, a4, .LBB41_2 +; CHECK-NOV-NEXT: mv a1, a3 +; CHECK-NOV-NEXT: fcvt.w.s a4, fa1, rtz +; CHECK-NOV-NEXT: blt a2, a3, .LBB41_2 ; CHECK-NOV-NEXT: .LBB41_7: # %entry -; CHECK-NOV-NEXT: mv a2, a4 +; CHECK-NOV-NEXT: mv a2, a3 ; CHECK-NOV-NEXT: fcvt.w.s a5, fa0, rtz -; CHECK-NOV-NEXT: blt a3, a4, .LBB41_3 +; CHECK-NOV-NEXT: blt a4, a3, .LBB41_3 ; CHECK-NOV-NEXT: .LBB41_8: # %entry -; CHECK-NOV-NEXT: mv a3, a4 -; CHECK-NOV-NEXT: bge a5, a4, .LBB41_4 +; CHECK-NOV-NEXT: mv a4, a3 +; CHECK-NOV-NEXT: bge a5, a3, .LBB41_4 ; CHECK-NOV-NEXT: j .LBB41_5 ; ; CHECK-V-LABEL: ustest_f32i16_mm: diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index 0eb69c89f2c44..d79d28d52e73c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -1037,14 +1037,14 @@ define @fshr_v16i64( %a, @fshl_v16i64( %a, @llvm.experimental.stepvector.nxv1i64() %gep = getelementptr 
inbounds i64, ptr %p, <vscale x 1 x i64> %step
diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
index 409c87c55092d..cf984ff149a5a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
@@ -1079,11 +1079,19 @@ declare <vscale x 16 x double> @llvm.vp.nearbyint.nxv16f64(<vscale x 16 x double>, <vscale x 16 x i1>, i32)
 define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv16f64:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmv1r.v v1, v0
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: srli a2, a1, 3
 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vx v2, v0, a2
+; CHECK-NEXT: vslidedown.vx v25, v0, a2
 ; CHECK-NEXT: sub a2, a0, a1
 ; CHECK-NEXT: sltu a3, a0, a2
 ; CHECK-NEXT: addi a3, a3, -1
@@ -1091,35 +1099,62 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT: lui a3, %hi(.LCPI32_0)
 ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3)
 ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v2
-; CHECK-NEXT: vfabs.v v24, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v25
+; CHECK-NEXT: vmv8r.v v8, v16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vfabs.v v16, v16, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT: frflags a2
 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v2
-; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v25
+; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v8, v16, v0.t
 ; CHECK-NEXT: fsflags a2
 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT: bltu a0, a1, .LBB32_2
 ; CHECK-NEXT: # %bb.1:
 ; CHECK-NEXT: mv a0, a1
 ; CHECK-NEXT: .LBB32_2:
 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v1
-; CHECK-NEXT: vfabs.v v24, v8, v0.t
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfabs.v v16, v8, v0.t
 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t
+; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t
 ; CHECK-NEXT: frflags a0
 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v1
-; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT: fsflags a0
 ; CHECK-NEXT: vsetvli zero, zero, e64, 
m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f64( %va, %m, i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll index a731b40f0ead3..ebb186b197b41 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll @@ -992,14 +992,16 @@ define @vp_rint_nxv16f64( %va, @vp_rint_nxv16f64( %va, @vp_round_nxv16f64( %va, @vp_round_nxv16f64( %va, @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v2, v0, a2 +; CHECK-NEXT: vslidedown.vx v25, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1099,43 +1101,49 @@ define @vp_roundeven_nxv16f64( %va, ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e64, 
m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll index 54b326b0b6018..798c7e05bd47b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll @@ -1084,14 +1084,16 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v2, v0, a2 +; CHECK-NEXT: vslidedown.vx v25, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1099,43 +1101,49 @@ define @vp_roundtozero_nxv16f64( %v ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfabs.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfabs.v v8, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vmflt.vf v25, v8, fa5, v0.t ; CHECK-NEXT: fsrmi a2, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vfcvt.x.f.v v8, v16, v0.t ; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, 
v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t ; CHECK-NEXT: fsrmi a0, 1 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll index 87b4e60aaca2c..191d932923eb5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll @@ -27,7 +27,7 @@ define void @rvv_vla(i64 %n, i64 %i) nounwind { ; CHECK-NEXT: vl2re64.v v8, (a2) ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: lw a0, 0(a0) +; CHECK-NEXT: lw zero, 0(a0) ; CHECK-NEXT: addi sp, s0, -32 ; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -65,7 +65,7 @@ define void @rvv_overaligned() nounwind { ; CHECK-NEXT: vl1re64.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 112 ; CHECK-NEXT: vl2re64.v v8, (a0) -; CHECK-NEXT: lw a0, 64(sp) +; CHECK-NEXT: lw zero, 64(sp) ; CHECK-NEXT: addi sp, s0, -128 ; CHECK-NEXT: ld ra, 120(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 112(sp) # 8-byte Folded Reload @@ -109,10 +109,10 @@ define void @rvv_vla_and_overaligned(i64 %n, i64 %i) nounwind { ; CHECK-NEXT: vl1re64.v v8, (a2) ; CHECK-NEXT: addi a2, s1, 112 ; CHECK-NEXT: vl2re64.v v8, (a2) -; CHECK-NEXT: lw a2, 64(s1) +; CHECK-NEXT: lw zero, 64(s1) ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: lw a0, 0(a0) +; CHECK-NEXT: lw zero, 0(a0) ; CHECK-NEXT: addi sp, s0, -144 ; CHECK-NEXT: ld ra, 136(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 128(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 848f65ba3f614..40f5aeb57d176 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -2241,13 +2241,13 @@ define @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV32-NEXT: addi a0, a0, %lo(.LCPI15_0) ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV32-NEXT: vle16.v v16, (a0) -; RV32-NEXT: vmv2r.v v20, v10 +; RV32-NEXT: vle16.v v20, (a0) +; RV32-NEXT: vmv2r.v v16, v10 ; RV32-NEXT: vmv2r.v v12, v8 -; RV32-NEXT: vrgather.vv v8, v12, v16 +; RV32-NEXT: vrgather.vv v8, v12, v20 ; RV32-NEXT: vid.v v12 ; RV32-NEXT: vrsub.vi v12, v12, 15 ; RV32-NEXT: lui a0, 16 @@ -265,7 +265,7 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-NEXT: vmv.s.x v0, a0 ; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV32-NEXT: vrgather.vv v8, v20, v12, v0.t +; RV32-NEXT: vrgather.vv v8, v16, v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: v16i16_2: @@ -274,10 +274,10 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV64-NEXT: addi a0, a0, %lo(.LCPI15_0) ; RV64-NEXT: li 
a1, 32 ; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; RV64-NEXT: vle16.v v16, (a0) -; RV64-NEXT: vmv2r.v v20, v10 +; RV64-NEXT: vle16.v v20, (a0) +; RV64-NEXT: vmv2r.v v16, v10 ; RV64-NEXT: vmv2r.v v12, v8 -; RV64-NEXT: vrgather.vv v8, v12, v16 +; RV64-NEXT: vrgather.vv v8, v12, v20 ; RV64-NEXT: vid.v v12 ; RV64-NEXT: vrsub.vi v12, v12, 15 ; RV64-NEXT: lui a0, 16 @@ -285,7 +285,7 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) { ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV64-NEXT: vrgather.vv v8, v20, v12, v0.t +; RV64-NEXT: vrgather.vv v8, v16, v12, v0.t ; RV64-NEXT: ret %v32i16 = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> ret <32 x i16> %v32i16 @@ -369,18 +369,18 @@ define <16 x i32> @v8i32_2(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: v8i32_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv2r.v v16, v10 +; CHECK-NEXT: vmv2r.v v12, v8 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v18, v10, 15 +; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: vrsub.vi v18, v14, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v18 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vrsub.vi v8, v10, 7 +; CHECK-NEXT: vrsub.vi v12, v14, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t ; CHECK-NEXT: ret %v16i32 = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> ret <16 x i32> %v16i32 @@ -700,18 +700,18 @@ define <16 x float> @v8f32_2(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: v8f32_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vmv2r.v v16, v10 +; CHECK-NEXT: vmv2r.v v12, v8 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v18, v10, 15 +; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: vrsub.vi v18, v14, 15 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v18 +; CHECK-NEXT: vrgatherei16.vv v8, v12, v18 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vrsub.vi v8, v10, 7 +; CHECK-NEXT: vrsub.vi v12, v14, 7 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgatherei16.vv v8, v16, v12, v0.t ; CHECK-NEXT: ret %v16f32 = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> ret <16 x float> %v16f32 diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index c4caf94bec6c6..63a85b1f4dc74 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -1459,42 +1459,42 @@ for.cond.cleanup: ; preds = %vector.body define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fmul_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB26_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB26_5 ; CHECK-NEXT: .LBB26_2: # %vector.ph -; 
CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB26_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfmul.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB26_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB26_7 ; CHECK-NEXT: .LBB26_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB26_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fmul.s fa5, fa5, fa0 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB26_6 +; CHECK-NEXT: bnez a1, .LBB26_6 ; CHECK-NEXT: .LBB26_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1549,42 +1549,42 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fdiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB27_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB27_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB27_5 ; CHECK-NEXT: .LBB27_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB27_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB27_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB27_7 ; CHECK-NEXT: .LBB27_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB27_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fdiv.s fa5, fa5, fa0 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB27_6 +; CHECK-NEXT: bnez a1, .LBB27_6 ; CHECK-NEXT: .LBB27_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1639,42 +1639,42 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frdiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: 
bgeu a1, a3, .LBB28_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB28_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB28_5 ; CHECK-NEXT: .LBB28_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB28_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfrdiv.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB28_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB28_7 ; CHECK-NEXT: .LBB28_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB28_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fdiv.s fa5, fa0, fa5 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB28_6 +; CHECK-NEXT: bnez a1, .LBB28_6 ; CHECK-NEXT: .LBB28_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1729,42 +1729,42 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fadd_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB29_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB29_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB29_5 ; CHECK-NEXT: .LBB29_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB29_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfadd.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB29_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB29_7 ; CHECK-NEXT: .LBB29_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB29_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fadd.s fa5, fa5, fa0 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB29_6 +; CHECK-NEXT: bnez a1, .LBB29_6 ; CHECK-NEXT: .LBB29_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1819,42 +1819,42 @@ 
for.body: ; preds = %for.body.preheader, define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fsub_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB30_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB30_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB30_5 ; CHECK-NEXT: .LBB30_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB30_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB30_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB30_7 ; CHECK-NEXT: .LBB30_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB30_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fsub.s fa5, fa5, fa0 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB30_6 +; CHECK-NEXT: bnez a1, .LBB30_6 ; CHECK-NEXT: .LBB30_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1909,42 +1909,42 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frsub_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB31_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a3, .LBB31_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB31_5 ; CHECK-NEXT: .LBB31_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: addiw a2, a3, -1 +; CHECK-NEXT: andi a4, a2, 1024 +; CHECK-NEXT: xori a2, a4, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 +; CHECK-NEXT: mv a6, a2 ; CHECK-NEXT: .LBB31_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfrsub.vf v8, v8, fa0 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB31_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB31_7 ; CHECK-NEXT: .LBB31_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB31_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: fsub.s fa5, fa0, 
fa5 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB31_6 +; CHECK-NEXT: bnez a1, .LBB31_6 ; CHECK-NEXT: .LBB31_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2075,48 +2075,48 @@ for.cond.cleanup: ; preds = %vector.body define void @sink_splat_fma_scalable(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, float %x) { ; CHECK-LABEL: sink_splat_fma_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: srli a4, a3, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a4, .LBB34_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a4, 1024 +; CHECK-NEXT: bgeu a4, a3, .LBB34_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a4, 0 ; CHECK-NEXT: j .LBB34_5 ; CHECK-NEXT: .LBB34_2: # %vector.ph -; CHECK-NEXT: addiw a2, a4, -1 -; CHECK-NEXT: andi a5, a2, 1024 -; CHECK-NEXT: xori a2, a5, 1024 +; CHECK-NEXT: addiw a4, a3, -1 +; CHECK-NEXT: andi a5, a4, 1024 +; CHECK-NEXT: xori a4, a5, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a6, a0 ; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: mv t0, a2 +; CHECK-NEXT: mv t0, a4 ; CHECK-NEXT: .LBB34_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a6) ; CHECK-NEXT: vl1re32.v v9, (a7) ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vs1r.v v9, (a6) -; CHECK-NEXT: sub t0, t0, a4 -; CHECK-NEXT: add a7, a7, a3 -; CHECK-NEXT: add a6, a6, a3 +; CHECK-NEXT: sub t0, t0, a3 +; CHECK-NEXT: add a7, a7, a2 +; CHECK-NEXT: add a6, a6, a2 ; CHECK-NEXT: bnez t0, .LBB34_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a5, .LBB34_7 ; CHECK-NEXT: .LBB34_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a4, -1024 +; CHECK-NEXT: slli a4, a4, 2 +; CHECK-NEXT: add a1, a1, a4 +; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: .LBB34_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: flw fa4, 0(a1) ; CHECK-NEXT: fmadd.s fa5, fa5, fa0, fa4 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a1, a1, 4 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB34_6 +; CHECK-NEXT: bnez a2, .LBB34_6 ; CHECK-NEXT: .LBB34_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2175,48 +2175,48 @@ for.body: ; preds = %for.body.preheader, define void @sink_splat_fma_commute_scalable(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, float %x) { ; CHECK-LABEL: sink_splat_fma_commute_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: srli a4, a3, 2 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a4, .LBB35_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a3, a2, 2 +; CHECK-NEXT: li a4, 1024 +; CHECK-NEXT: bgeu a4, a3, .LBB35_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a4, 0 ; CHECK-NEXT: j .LBB35_5 ; CHECK-NEXT: .LBB35_2: # %vector.ph -; CHECK-NEXT: addiw a2, a4, -1 -; CHECK-NEXT: andi a5, a2, 1024 -; CHECK-NEXT: xori a2, a5, 1024 +; CHECK-NEXT: addiw a4, a3, -1 +; CHECK-NEXT: andi a5, a4, 1024 +; CHECK-NEXT: xori a4, a5, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma ; CHECK-NEXT: mv a6, a0 ; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: mv t0, a2 +; CHECK-NEXT: mv t0, a4 ; CHECK-NEXT: .LBB35_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: 
Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a6) ; CHECK-NEXT: vl1re32.v v9, (a7) ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vs1r.v v9, (a6) -; CHECK-NEXT: sub t0, t0, a4 -; CHECK-NEXT: add a7, a7, a3 -; CHECK-NEXT: add a6, a6, a3 +; CHECK-NEXT: sub t0, t0, a3 +; CHECK-NEXT: add a7, a7, a2 +; CHECK-NEXT: add a6, a6, a2 ; CHECK-NEXT: bnez t0, .LBB35_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a5, .LBB35_7 ; CHECK-NEXT: .LBB35_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a1, a1, a2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a4, -1024 +; CHECK-NEXT: slli a4, a4, 2 +; CHECK-NEXT: add a1, a1, a4 +; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: .LBB35_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw fa5, 0(a0) ; CHECK-NEXT: flw fa4, 0(a1) ; CHECK-NEXT: fmadd.s fa5, fa0, fa5, fa4 ; CHECK-NEXT: fsw fa5, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a1, a1, 4 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB35_6 +; CHECK-NEXT: bnez a2, .LBB35_6 ; CHECK-NEXT: .LBB35_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store-intrinsics.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store-intrinsics.ll index 06af0fc8971a5..f8a5b0f9d03a6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store-intrinsics.ll @@ -89,9 +89,8 @@ define void @strided_store_i8_nostride(ptr %p, <32 x i8> %v, <32 x i1> %m) { ; CHECK-LABEL: strided_store_i8_nostride: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: li a2, 1 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vsse8.v v8, (a0), a2, v0.t +; CHECK-NEXT: vse8.v v8, (a0), v0.t ; CHECK-NEXT: ret call void @llvm.riscv.masked.strided.store.v32i8.p0.i64(<32 x i8> %v, ptr %p, i64 1, <32 x i1> %m) ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll index c023c6a86f8b6..47074d612bb64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll @@ -1,23 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV32 +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: -check-prefixes=CHECK,CHECK-RV32 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64 +; RUN: -verify-machineinstrs < %s | FileCheck %s \ +; RUN: -check-prefixes=CHECK,CHECK-RV64 declare @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, , i32) define @strided_vpload_nxv1i8_i8(ptr %ptr, i8 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1i8_i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV32-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1i8_i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV64-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1i8_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, 
i8 %stride, %m, i32 %evl) ret %load } @@ -25,17 +21,11 @@ define @strided_vpload_nxv1i8_i8(ptr %ptr, i8 signext %stride, declare @llvm.experimental.vp.strided.load.nxv1i8.p0.i16(ptr, i16, , i32) define @strided_vpload_nxv1i8_i16(ptr %ptr, i16 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1i8_i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV32-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1i8_i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV64-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1i8_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv1i8.p0.i16(ptr %ptr, i16 %stride, %m, i32 %evl) ret %load } @@ -79,33 +69,21 @@ define @strided_vpload_nxv1i8_i64_allones_mask(ptr %ptr, i64 s declare @llvm.experimental.vp.strided.load.nxv1i8.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv1i8(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV32-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV64-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv1i8.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } define @strided_vpload_nxv1i8_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1i8_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV32-NEXT: vlse8.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1i8_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV64-NEXT: vlse8.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), a1 +; CHECK-NEXT: ret %a = insertelement poison, i1 true, i32 0 %b = shufflevector %a, poison, zeroinitializer %load = call @llvm.experimental.vp.strided.load.nxv1i8.p0.i32(ptr %ptr, i32 signext %stride, %b, i32 %evl) @@ -115,17 +93,11 @@ define @strided_vpload_nxv1i8_allones_mask(ptr %ptr, i32 signe declare @llvm.experimental.vp.strided.load.nxv2i8.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv2i8(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv2i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf4, ta, ma -; CHECK-RV32-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv2i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf4, ta, ma -; CHECK-RV64-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret 
%load = call @llvm.experimental.vp.strided.load.nxv2i8.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -133,17 +105,11 @@ define @strided_vpload_nxv2i8(ptr %ptr, i32 signext %stride, < declare @llvm.experimental.vp.strided.load.nxv4i8.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv4i8(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv4i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma -; CHECK-RV32-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv4i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma -; CHECK-RV64-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv4i8.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -151,33 +117,31 @@ define @strided_vpload_nxv4i8(ptr %ptr, i32 signext %stride, < declare @llvm.experimental.vp.strided.load.nxv8i8.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv8i8(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv8i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m1, ta, ma -; CHECK-RV32-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv8i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m1, ta, ma -; CHECK-RV64-NEXT: vlse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv8i8.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } +define @strided_vpload_nxv8i8_unit_stride(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_nxv8i8_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call @llvm.experimental.vp.strided.load.nxv8i8.p0.i32(ptr %ptr, i32 1, %m, i32 %evl) + ret %load +} + define @strided_vpload_nxv8i8_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv8i8_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m1, ta, ma -; CHECK-RV32-NEXT: vlse8.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv8i8_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m1, ta, ma -; CHECK-RV64-NEXT: vlse8.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv8i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v8, (a0), a1 +; CHECK-NEXT: ret %a = insertelement poison, i1 true, i32 0 %b = shufflevector %a, poison, zeroinitializer %load = call @llvm.experimental.vp.strided.load.nxv8i8.p0.i32(ptr %ptr, i32 signext %stride, %b, i32 %evl) @@ -187,17 +151,11 @@ define @strided_vpload_nxv8i8_allones_mask(ptr %ptr, i32 signe declare @llvm.experimental.vp.strided.load.nxv1i16.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv1i16(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, mf4, ta, ma 
-; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf4, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv1i16.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -205,33 +163,21 @@ define @strided_vpload_nxv1i16(ptr %ptr, i32 signext %stride, declare @llvm.experimental.vp.strided.load.nxv2i16.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv2i16(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv2i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv2i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv2i16.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } define @strided_vpload_nxv2i16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv2i16_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv2i16_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv2i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1 +; CHECK-NEXT: ret %a = insertelement poison, i1 true, i32 0 %b = shufflevector %a, poison, zeroinitializer %load = call @llvm.experimental.vp.strided.load.nxv2i16.p0.i32(ptr %ptr, i32 signext %stride, %b, i32 %evl) @@ -241,35 +187,33 @@ define @strided_vpload_nxv2i16_allones_mask(ptr %ptr, i32 sig declare @llvm.experimental.vp.strided.load.nxv4i16.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv4i16(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv4i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m1, ta, ma -; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv4i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m1, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv4i16.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } +define @strided_vpload_nxv4i16_unit_stride(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_nxv4i16_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0), v0.t +; CHECK-NEXT: ret 
+ %load = call @llvm.experimental.vp.strided.load.nxv4i16.p0.i32(ptr %ptr, i32 2, %m, i32 %evl) + ret %load +} + declare @llvm.experimental.vp.strided.load.nxv8i16.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv8i16(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv8i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m2, ta, ma -; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv8i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m2, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv8i16.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -277,17 +221,11 @@ define @strided_vpload_nxv8i16(ptr %ptr, i32 signext %stride, declare @llvm.experimental.vp.strided.load.nxv1i32.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv1i32(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1i32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1i32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, mf2, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv1i32.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -295,51 +233,43 @@ define @strided_vpload_nxv1i32(ptr %ptr, i32 signext %stride, declare @llvm.experimental.vp.strided.load.nxv2i32.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv2i32(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv2i32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv2i32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv2i32.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } +define @strided_vpload_nxv2i32_unit_stride(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_nxv2i32_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call @llvm.experimental.vp.strided.load.nxv2i32.p0.i32(ptr %ptr, i32 4, %m, i32 %evl) + ret %load +} + declare @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv4i32(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv4i32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; 
CHECK-RV64-LABEL: strided_vpload_nxv4i32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } define @strided_vpload_nxv4i32_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv4i32_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv4i32_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv4i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1 +; CHECK-NEXT: ret %a = insertelement poison, i1 true, i32 0 %b = shufflevector %a, poison, zeroinitializer %load = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr %ptr, i32 signext %stride, %b, i32 %evl) @@ -349,17 +279,11 @@ define @strided_vpload_nxv4i32_allones_mask(ptr %ptr, i32 sig declare @llvm.experimental.vp.strided.load.nxv8i32.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv8i32(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv8i32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv8i32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv8i32.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -367,33 +291,31 @@ define @strided_vpload_nxv8i32(ptr %ptr, i32 signext %stride, declare @llvm.experimental.vp.strided.load.nxv1i64.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv1i64(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } +define @strided_vpload_nxv1i64_unit_stride(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_nxv1i64_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i32(ptr %ptr, i32 8, %m, 
i32 %evl) + ret %load +} + define @strided_vpload_nxv1i64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1i64_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1i64_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1 +; CHECK-NEXT: ret %a = insertelement poison, i1 true, i32 0 %b = shufflevector %a, poison, zeroinitializer %load = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i32(ptr %ptr, i32 signext %stride, %b, i32 %evl) @@ -403,17 +325,11 @@ define @strided_vpload_nxv1i64_allones_mask(ptr %ptr, i32 sig declare @llvm.experimental.vp.strided.load.nxv2i64.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv2i64(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv2i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv2i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m2, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv2i64.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -421,17 +337,11 @@ define @strided_vpload_nxv2i64(ptr %ptr, i32 signext %stride, declare @llvm.experimental.vp.strided.load.nxv4i64.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv4i64(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv4i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv4i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv4i64.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -439,17 +349,11 @@ define @strided_vpload_nxv4i64(ptr %ptr, i32 signext %stride, declare @llvm.experimental.vp.strided.load.nxv8i64.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv8i64(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv8i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv8i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t +; 
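; ----------------------------------------------------------------------------
; A minimal illustrative sketch (added for exposition; not part of the
; autogenerated FileCheck lines). The new *_unit_stride tests in this diff
; exercise the case where the byte stride of an experimental VP strided load
; equals the element size, so the access is equivalent to a unit-stride VP
; load and the backend can select vle*.v instead of vlse*.v. The function
; names @strided_example and @unit_example below are hypothetical; the
; intrinsic signatures follow the declares used elsewhere in this file.
;
;   declare <vscale x 4 x half> @llvm.experimental.vp.strided.load.nxv4f16.p0.i32(ptr, i32, <vscale x 4 x i1>, i32)
;   declare <vscale x 4 x half> @llvm.vp.load.nxv4f16.p0(ptr, <vscale x 4 x i1>, i32)
;
;   define <vscale x 4 x half> @strided_example(ptr %p, <vscale x 4 x i1> %m, i32 zeroext %evl) {
;     ; half elements are 2 bytes wide, so a byte stride of 2 reads
;     ; consecutive elements -- the same memory as a unit-stride load.
;     %x = call <vscale x 4 x half> @llvm.experimental.vp.strided.load.nxv4f16.p0.i32(ptr %p, i32 2, <vscale x 4 x i1> %m, i32 %evl)
;     ret <vscale x 4 x half> %x
;   }
;
;   define <vscale x 4 x half> @unit_example(ptr %p, <vscale x 4 x i1> %m, i32 zeroext %evl) {
;     ; Equivalent unit-stride form; on RISC-V this selects vle16.v rather
;     ; than vlse16.v, matching the *_unit_stride checks in this file.
;     %x = call <vscale x 4 x half> @llvm.vp.load.nxv4f16.p0(ptr %p, <vscale x 4 x i1> %m, i32 %evl)
;     ret <vscale x 4 x half> %x
;   }
; ----------------------------------------------------------------------------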
CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv8i64.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -457,17 +361,11 @@ define @strided_vpload_nxv8i64(ptr %ptr, i32 signext %stride, declare @llvm.experimental.vp.strided.load.nxv1f16.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv1f16(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1f16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, mf4, ta, ma -; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1f16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf4, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv1f16.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -475,33 +373,21 @@ define @strided_vpload_nxv1f16(ptr %ptr, i32 signext %stride declare @llvm.experimental.vp.strided.load.nxv2f16.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv2f16(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv2f16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv2f16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv2f16.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } define @strided_vpload_nxv2f16_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv2f16_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv2f16_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv2f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1 +; CHECK-NEXT: ret %a = insertelement poison, i1 true, i32 0 %b = shufflevector %a, poison, zeroinitializer %load = call @llvm.experimental.vp.strided.load.nxv2f16.p0.i32(ptr %ptr, i32 signext %stride, %b, i32 %evl) @@ -511,35 +397,33 @@ define @strided_vpload_nxv2f16_allones_mask(ptr %ptr, i32 si declare @llvm.experimental.vp.strided.load.nxv4f16.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv4f16(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv4f16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m1, ta, ma -; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv4f16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m1, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv4f16: 
+; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv4f16.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } +define @strided_vpload_nxv4f16_unit_stride(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_nxv4f16_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call @llvm.experimental.vp.strided.load.nxv4f16.p0.i32(ptr %ptr, i32 2, %m, i32 %evl) + ret %load +} + declare @llvm.experimental.vp.strided.load.nxv8f16.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv8f16(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv8f16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m2, ta, ma -; CHECK-RV32-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv8f16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m2, ta, ma -; CHECK-RV64-NEXT: vlse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vlse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv8f16.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -547,17 +431,11 @@ define @strided_vpload_nxv8f16(ptr %ptr, i32 signext %stride declare @llvm.experimental.vp.strided.load.nxv1f32.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv1f32(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1f32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1f32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, mf2, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv1f32.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -565,35 +443,33 @@ define @strided_vpload_nxv1f32(ptr %ptr, i32 signext %strid declare @llvm.experimental.vp.strided.load.nxv2f32.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv2f32(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv2f32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv2f32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv2f32.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } +define @strided_vpload_nxv2f32_unit_stride(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_nxv2f32_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0), v0.t 
+; CHECK-NEXT: ret + %load = call @llvm.experimental.vp.strided.load.nxv2f32.p0.i32(ptr %ptr, i32 4, %m, i32 %evl) + ret %load +} + declare @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv4f32(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv4f32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv4f32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv4f32.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -601,33 +477,21 @@ define @strided_vpload_nxv4f32(ptr %ptr, i32 signext %strid declare @llvm.experimental.vp.strided.load.nxv8f32.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv8f32(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv8f32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv8f32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv8f32.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } define @strided_vpload_nxv8f32_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv8f32_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV32-NEXT: vlse32.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv8f32_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV64-NEXT: vlse32.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv8f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vlse32.v v8, (a0), a1 +; CHECK-NEXT: ret %a = insertelement poison, i1 true, i32 0 %b = shufflevector %a, poison, zeroinitializer %load = call @llvm.experimental.vp.strided.load.nxv8f32.p0.i32(ptr %ptr, i32 signext %stride, %b, i32 %evl) @@ -637,35 +501,33 @@ define @strided_vpload_nxv8f32_allones_mask(ptr %ptr, i32 s declare @llvm.experimental.vp.strided.load.nxv1f64.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv1f64(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv1f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv1f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vlse64.v v8, 
(a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv1f64.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } +define @strided_vpload_nxv1f64_unit_stride(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpload_nxv1f64_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v8, (a0), v0.t +; CHECK-NEXT: ret + %load = call @llvm.experimental.vp.strided.load.nxv1f64.p0.i32(ptr %ptr, i32 8, %m, i32 %evl) + ret %load +} + declare @llvm.experimental.vp.strided.load.nxv2f64.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv2f64(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv2f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv2f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m2, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv2f64.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } @@ -673,33 +535,21 @@ define @strided_vpload_nxv2f64(ptr %ptr, i32 signext %stri declare @llvm.experimental.vp.strided.load.nxv4f64.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv4f64(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv4f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv4f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv4f64.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } define @strided_vpload_nxv4f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv4f64_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv4f64_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv4f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1 +; CHECK-NEXT: ret %a = insertelement poison, i1 true, i32 0 %b = shufflevector %a, poison, zeroinitializer %load = call @llvm.experimental.vp.strided.load.nxv4f64.p0.i32(ptr %ptr, i32 signext %stride, %b, i32 %evl) @@ -709,50 +559,32 @@ define @strided_vpload_nxv4f64_allones_mask(ptr %ptr, i32 declare @llvm.experimental.vp.strided.load.nxv8f64.p0.i32(ptr, i32, , i32) define @strided_vpload_nxv8f64(ptr %ptr, i32 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv8f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV32-NEXT: 
vlse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv8f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %load = call @llvm.experimental.vp.strided.load.nxv8f64.p0.i32(ptr %ptr, i32 signext %stride, %m, i32 %evl) ret %load } ; Widening define @strided_vpload_nxv3f64(ptr %ptr, i32 signext %stride, %mask, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv3f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv3f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv3f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret %v = call @llvm.experimental.vp.strided.load.nxv3f64.p0.i32(ptr %ptr, i32 %stride, %mask, i32 %evl) ret %v } define @strided_vpload_nxv3f64_allones_mask(ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpload_nxv3f64_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV32-NEXT: vlse64.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpload_nxv3f64_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV64-NEXT: vlse64.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpload_nxv3f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vlse64.v v8, (a0), a1 +; CHECK-NEXT: ret %one = insertelement poison, i1 true, i32 0 %allones = shufflevector %one, poison, zeroinitializer %v = call @llvm.experimental.vp.strided.load.nxv3f64.p0.i32(ptr %ptr, i32 %stride, %allones, i32 %evl) @@ -771,10 +603,10 @@ define @strided_load_nxv16f64(ptr %ptr, i64 %stride, @strided_load_nxv16f64(ptr %ptr, i64 %stride, @strided_load_nxv16f64_allones_mask(ptr %ptr, i64 ; CHECK-RV32-NEXT: sltu a5, a3, a2 ; CHECK-RV32-NEXT: addi a5, a5, -1 ; CHECK-RV32-NEXT: and a2, a5, a2 -; CHECK-RV32-NEXT: bltu a3, a4, .LBB43_2 +; CHECK-RV32-NEXT: bltu a3, a4, .LBB50_2 ; CHECK-RV32-NEXT: # %bb.1: ; CHECK-RV32-NEXT: mv a3, a4 -; CHECK-RV32-NEXT: .LBB43_2: +; CHECK-RV32-NEXT: .LBB50_2: ; CHECK-RV32-NEXT: mul a4, a3, a1 ; CHECK-RV32-NEXT: add a4, a0, a4 ; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma @@ -843,10 +675,10 @@ define @strided_load_nxv16f64_allones_mask(ptr %ptr, i64 ; CHECK-RV64-NEXT: sltu a5, a2, a3 ; CHECK-RV64-NEXT: addi a5, a5, -1 ; CHECK-RV64-NEXT: and a3, a5, a3 -; CHECK-RV64-NEXT: bltu a2, a4, .LBB43_2 +; CHECK-RV64-NEXT: bltu a2, a4, .LBB50_2 ; CHECK-RV64-NEXT: # %bb.1: ; CHECK-RV64-NEXT: mv a2, a4 -; CHECK-RV64-NEXT: .LBB43_2: +; CHECK-RV64-NEXT: .LBB50_2: ; CHECK-RV64-NEXT: mul a4, a2, a1 ; CHECK-RV64-NEXT: add a4, a0, a4 ; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma @@ -873,19 +705,19 @@ define @strided_load_nxv17f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, @strided_load_nxv17f64(ptr %ptr, i64 %stride, , ptr, i8, , i32) define void 
@strided_vpstore_nxv1i8_i8( %val, ptr %ptr, i8 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1i8_i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV32-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1i8_i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV64-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1i8_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv1i8.p0.i8( %val, ptr %ptr, i8 %stride, %m, i32 %evl) ret void } @@ -25,17 +21,11 @@ define void @strided_vpstore_nxv1i8_i8( %val, ptr %ptr, i8 sign declare void @llvm.experimental.vp.strided.store.nxv1i8.p0.i16(, ptr, i16, , i32) define void @strided_vpstore_nxv1i8_i16( %val, ptr %ptr, i16 signext %stride, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1i8_i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV32-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1i8_i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV64-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1i8_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv1i8.p0.i16( %val, ptr %ptr, i16 %stride, %m, i32 %evl) ret void } @@ -61,17 +51,11 @@ define void @strided_vpstore_nxv1i8_i64( %val, ptr %ptr, i64 si declare void @llvm.experimental.vp.strided.store.nxv1i8.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv1i8( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV32-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV64-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv1i8.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -79,17 +63,11 @@ define void @strided_vpstore_nxv1i8( %val, ptr %ptr, i32 signex declare void @llvm.experimental.vp.strided.store.nxv2i8.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv2i8( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv2i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf4, ta, ma -; CHECK-RV32-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv2i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf4, ta, ma -; CHECK-RV64-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf4, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv2i8.p0.i32( 
%val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -97,17 +75,11 @@ define void @strided_vpstore_nxv2i8( %val, ptr %ptr, i32 signex declare void @llvm.experimental.vp.strided.store.nxv4i8.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv4i8( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv4i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma -; CHECK-RV32-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv4i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma -; CHECK-RV64-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv4i8.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -115,35 +87,33 @@ define void @strided_vpstore_nxv4i8( %val, ptr %ptr, i32 signex declare void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv8i8( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv8i8: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, m1, ta, ma -; CHECK-RV32-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv8i8: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, m1, ta, ma -; CHECK-RV64-NEXT: vsse8.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } +define void @strided_vpstore_nxv8i8_unit_stride( %val, ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_nxv8i8_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vse8.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32( %val, ptr %ptr, i32 1, %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.nxv1i16.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv1i16( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, mf4, ta, ma -; CHECK-RV32-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf4, ta, ma -; CHECK-RV64-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv1i16.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -151,17 +121,11 @@ define void @strided_vpstore_nxv1i16( %val, ptr %ptr, i32 sign declare void @llvm.experimental.vp.strided.store.nxv2i16.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv2i16( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv2i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli 
zero, a2, e16, mf2, ta, ma -; CHECK-RV32-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv2i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV64-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv2i16.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -169,35 +133,33 @@ define void @strided_vpstore_nxv2i16( %val, ptr %ptr, i32 sign declare void @llvm.experimental.vp.strided.store.nxv4i16.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv4i16( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv4i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m1, ta, ma -; CHECK-RV32-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv4i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m1, ta, ma -; CHECK-RV64-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv4i16.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } +define void @strided_vpstore_nxv4i16_unit_stride( %val, ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_nxv4i16_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.nxv4i16.p0.i32( %val, ptr %ptr, i32 2, %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.nxv8i16.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv8i16( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv8i16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m2, ta, ma -; CHECK-RV32-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv8i16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m2, ta, ma -; CHECK-RV64-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv8i16.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -205,17 +167,11 @@ define void @strided_vpstore_nxv8i16( %val, ptr %ptr, i32 sign declare void @llvm.experimental.vp.strided.store.nxv1i32.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv1i32( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1i32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1i32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, mf2, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, 
mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv1i32.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -223,17 +179,11 @@ define void @strided_vpstore_nxv1i32( %val, ptr %ptr, i32 sign declare void @llvm.experimental.vp.strided.store.nxv2i32.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv2i32( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv2i32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv2i32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv2i32.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -241,35 +191,33 @@ define void @strided_vpstore_nxv2i32( %val, ptr %ptr, i32 sign declare void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv4i32( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv4i32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv4i32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } +define void @strided_vpstore_nxv4i32_unit_stride( %val, ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_nxv4i32_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( %val, ptr %ptr, i32 4, %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.nxv8i32.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv8i32( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv8i32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv8i32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv8i32.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -277,35 +225,33 @@ define void @strided_vpstore_nxv8i32( %val, ptr %ptr, i32 sign declare void @llvm.experimental.vp.strided.store.nxv1i64.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv1i64( 
%val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } +define void @strided_vpstore_nxv1i64_unit_stride( %val, ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_nxv1i64_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i32( %val, ptr %ptr, i32 8, %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv2i64( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv2i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv2i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m2, ta, ma -; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -313,17 +259,11 @@ define void @strided_vpstore_nxv2i64( %val, ptr %ptr, i32 sign declare void @llvm.experimental.vp.strided.store.nxv4i64.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv4i64( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv4i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv4i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -331,17 +271,11 @@ define void @strided_vpstore_nxv4i64( %val, ptr %ptr, i32 sign declare void @llvm.experimental.vp.strided.store.nxv8i64.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv8i64( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv8i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv8i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; 
CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv8i64.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -349,17 +283,11 @@ define void @strided_vpstore_nxv8i64( %val, ptr %ptr, i32 sign declare void @llvm.experimental.vp.strided.store.nxv1f16.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv1f16( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1f16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, mf4, ta, ma -; CHECK-RV32-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1f16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf4, ta, ma -; CHECK-RV64-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf4, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv1f16.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -367,17 +295,11 @@ define void @strided_vpstore_nxv1f16( %val, ptr %ptr, i32 sig declare void @llvm.experimental.vp.strided.store.nxv2f16.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv2f16( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv2f16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV32-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv2f16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-RV64-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv2f16.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -385,35 +307,33 @@ define void @strided_vpstore_nxv2f16( %val, ptr %ptr, i32 sig declare void @llvm.experimental.vp.strided.store.nxv4f16.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv4f16( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv4f16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m1, ta, ma -; CHECK-RV32-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv4f16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m1, ta, ma -; CHECK-RV64-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv4f16.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } +define void @strided_vpstore_nxv4f16_unit_stride( %val, ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_nxv4f16_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vse16.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.nxv4f16.p0.i32( %val, 
ptr %ptr, i32 2, %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.nxv8f16.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv8f16( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv8f16: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e16, m2, ta, ma -; CHECK-RV32-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv8f16: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m2, ta, ma -; CHECK-RV64-NEXT: vsse16.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, ma +; CHECK-NEXT: vsse16.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv8f16.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -421,17 +341,11 @@ define void @strided_vpstore_nxv8f16( %val, ptr %ptr, i32 sig declare void @llvm.experimental.vp.strided.store.nxv1f32.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv1f32( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1f32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1f32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, mf2, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, mf2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv1f32.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -439,17 +353,11 @@ define void @strided_vpstore_nxv1f32( %val, ptr %ptr, i32 si declare void @llvm.experimental.vp.strided.store.nxv2f32.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv2f32( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv2f32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv2f32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv2f32.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -457,35 +365,33 @@ define void @strided_vpstore_nxv2f32( %val, ptr %ptr, i32 si declare void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv4f32( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv4f32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv4f32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } +define void @strided_vpstore_nxv4f32_unit_stride( %val, ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_nxv4f32_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vse32.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32( %val, ptr %ptr, i32 4, %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.nxv8f32.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv8f32( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv8f32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv8f32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv8f32.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -493,35 +399,33 @@ define void @strided_vpstore_nxv8f32( %val, ptr %ptr, i32 si declare void @llvm.experimental.vp.strided.store.nxv1f64.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv1f64( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv1f64.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } +define void @strided_vpstore_nxv1f64_unit_stride( %val, ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: strided_vpstore_nxv1f64_unit_stride: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v8, (a0), v0.t +; CHECK-NEXT: ret + call void @llvm.experimental.vp.strided.store.nxv1f64.p0.i32( %val, ptr %ptr, i32 8, %m, i32 %evl) + ret void +} + declare void @llvm.experimental.vp.strided.store.nxv2f64.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv2f64( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv2f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv2f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m2, ta, ma -; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call 
void @llvm.experimental.vp.strided.store.nxv2f64.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -529,17 +433,11 @@ define void @strided_vpstore_nxv2f64( %val, ptr %ptr, i32 s declare void @llvm.experimental.vp.strided.store.nxv4f64.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv4f64( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv4f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv4f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m4, ta, ma -; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv4f64.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } @@ -547,33 +445,21 @@ define void @strided_vpstore_nxv4f64( %val, ptr %ptr, i32 s declare void @llvm.experimental.vp.strided.store.nxv8f64.p0.i32(, ptr, i32, , i32) define void @strided_vpstore_nxv8f64( %val, ptr %ptr, i32 signext %strided, %m, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv8f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv8f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv8f64.p0.i32( %val, ptr %ptr, i32 %strided, %m, i32 %evl) ret void } define void @strided_vpstore_nxv1i8_allones_mask( %val, ptr %ptr, i32 signext %strided, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv1i8_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV32-NEXT: vsse8.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv1i8_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e8, mf8, ta, ma -; CHECK-RV64-NEXT: vsse8.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv1i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e8, mf8, ta, ma +; CHECK-NEXT: vsse8.v v8, (a0), a1 +; CHECK-NEXT: ret %a = insertelement poison, i1 true, i32 0 %b = shufflevector %a, poison, zeroinitializer call void @llvm.experimental.vp.strided.store.nxv1i8.p0.i32( %val, ptr %ptr, i32 %strided, %b, i32 %evl) @@ -582,33 +468,21 @@ define void @strided_vpstore_nxv1i8_allones_mask( %val, ptr %pt ; Widening define void @strided_vpstore_nxv3f32( %v, ptr %ptr, i32 signext %stride, %mask, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv3f32: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv3f32: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv3f32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv3f32.p0.i32( %v, ptr %ptr, i32 %stride, %mask, i32 %evl) ret void } define void @strided_vpstore_nxv3f32_allones_mask( %v, ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_vpstore_nxv3f32_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV32-NEXT: vsse32.v v8, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_vpstore_nxv3f32_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetvli zero, a2, e32, m2, ta, ma -; CHECK-RV64-NEXT: vsse32.v v8, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_vpstore_nxv3f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; CHECK-NEXT: vsse32.v v8, (a0), a1 +; CHECK-NEXT: ret %one = insertelement poison, i1 true, i32 0 %allones = shufflevector %one, poison, zeroinitializer call void @llvm.experimental.vp.strided.store.nxv3f32.p0.i32( %v, ptr %ptr, i32 %stride, %allones, i32 %evl) @@ -619,95 +493,52 @@ declare void @llvm.experimental.vp.strided.store.nxv3f32.p0.i32( %v, ptr %ptr, i32 signext %stride, %mask, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_store_nxv16f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: csrr a3, vlenb -; CHECK-RV32-NEXT: mv a4, a2 -; CHECK-RV32-NEXT: bltu a2, a3, .LBB34_2 -; CHECK-RV32-NEXT: # %bb.1: -; CHECK-RV32-NEXT: mv a4, a3 -; CHECK-RV32-NEXT: .LBB34_2: -; CHECK-RV32-NEXT: vsetvli zero, a4, e64, m8, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV32-NEXT: sub a5, a2, a3 -; CHECK-RV32-NEXT: sltu a2, a2, a5 -; CHECK-RV32-NEXT: addi a2, a2, -1 -; CHECK-RV32-NEXT: and a2, a2, a5 -; CHECK-RV32-NEXT: mul a4, a4, a1 -; CHECK-RV32-NEXT: add a0, a0, a4 -; CHECK-RV32-NEXT: srli a3, a3, 3 -; CHECK-RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; CHECK-RV32-NEXT: vslidedown.vx v0, v0, a3 -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV32-NEXT: vsse64.v v16, (a0), a1, v0.t -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_store_nxv16f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: csrr a3, vlenb -; CHECK-RV64-NEXT: mv a4, a2 -; CHECK-RV64-NEXT: bltu a2, a3, .LBB34_2 -; CHECK-RV64-NEXT: # %bb.1: -; CHECK-RV64-NEXT: mv a4, a3 -; CHECK-RV64-NEXT: .LBB34_2: -; CHECK-RV64-NEXT: vsetvli zero, a4, e64, m8, ta, ma -; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1, v0.t -; CHECK-RV64-NEXT: sub a5, a2, a3 -; CHECK-RV64-NEXT: sltu a2, a2, a5 -; CHECK-RV64-NEXT: addi a2, a2, -1 -; CHECK-RV64-NEXT: and a2, a2, a5 -; CHECK-RV64-NEXT: mul a4, a4, a1 -; CHECK-RV64-NEXT: add a0, a0, a4 -; CHECK-RV64-NEXT: srli a3, a3, 3 -; CHECK-RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma -; CHECK-RV64-NEXT: vslidedown.vx v0, v0, a3 -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV64-NEXT: vsse64.v v16, (a0), a1, v0.t -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_store_nxv16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: bltu a2, a3, .LBB41_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a4, a3 +; CHECK-NEXT: .LBB41_2: +; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t +; CHECK-NEXT: sub a5, a2, a3 +; CHECK-NEXT: sltu a2, a2, a5 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a5 +; CHECK-NEXT: mul a4, a4, a1 +; CHECK-NEXT: add a0, a0, a4 +; CHECK-NEXT: srli a3, a3, 3 +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: 
vslidedown.vx v0, v0, a3 +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v16, (a0), a1, v0.t +; CHECK-NEXT: ret call void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32( %v, ptr %ptr, i32 %stride, %mask, i32 %evl) ret void } define void @strided_store_nxv16f64_allones_mask( %v, ptr %ptr, i32 signext %stride, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_store_nxv16f64_allones_mask: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: csrr a4, vlenb -; CHECK-RV32-NEXT: mv a3, a2 -; CHECK-RV32-NEXT: bltu a2, a4, .LBB35_2 -; CHECK-RV32-NEXT: # %bb.1: -; CHECK-RV32-NEXT: mv a3, a4 -; CHECK-RV32-NEXT: .LBB35_2: -; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-RV32-NEXT: vsse64.v v8, (a0), a1 -; CHECK-RV32-NEXT: sub a4, a2, a4 -; CHECK-RV32-NEXT: sltu a2, a2, a4 -; CHECK-RV32-NEXT: addi a2, a2, -1 -; CHECK-RV32-NEXT: and a2, a2, a4 -; CHECK-RV32-NEXT: mul a3, a3, a1 -; CHECK-RV32-NEXT: add a0, a0, a3 -; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV32-NEXT: vsse64.v v16, (a0), a1 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_store_nxv16f64_allones_mask: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: csrr a4, vlenb -; CHECK-RV64-NEXT: mv a3, a2 -; CHECK-RV64-NEXT: bltu a2, a4, .LBB35_2 -; CHECK-RV64-NEXT: # %bb.1: -; CHECK-RV64-NEXT: mv a3, a4 -; CHECK-RV64-NEXT: .LBB35_2: -; CHECK-RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; CHECK-RV64-NEXT: vsse64.v v8, (a0), a1 -; CHECK-RV64-NEXT: sub a4, a2, a4 -; CHECK-RV64-NEXT: sltu a2, a2, a4 -; CHECK-RV64-NEXT: addi a2, a2, -1 -; CHECK-RV64-NEXT: and a2, a2, a4 -; CHECK-RV64-NEXT: mul a3, a3, a1 -; CHECK-RV64-NEXT: add a0, a0, a3 -; CHECK-RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-RV64-NEXT: vsse64.v v16, (a0), a1 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: strided_store_nxv16f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: bltu a2, a3, .LBB42_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a4, a3 +; CHECK-NEXT: .LBB42_2: +; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v8, (a0), a1 +; CHECK-NEXT: sub a3, a2, a3 +; CHECK-NEXT: sltu a2, a2, a3 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: and a2, a2, a3 +; CHECK-NEXT: mul a3, a4, a1 +; CHECK-NEXT: add a0, a0, a3 +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vsse64.v v16, (a0), a1 +; CHECK-NEXT: ret %one = insertelement poison, i1 true, i32 0 %allones = shufflevector %one, poison, zeroinitializer call void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32( %v, ptr %ptr, i32 %stride, %allones, i32 %evl) @@ -718,127 +549,66 @@ declare void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32( %v, ptr %ptr, i32 signext %stride, %mask, i32 zeroext %evl) { -; CHECK-RV32-LABEL: strided_store_nxv17f64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: csrr a4, vlenb -; CHECK-RV32-NEXT: slli a6, a4, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v0 -; CHECK-RV32-NEXT: mv a5, a3 -; CHECK-RV32-NEXT: bltu a3, a6, .LBB36_2 -; CHECK-RV32-NEXT: # %bb.1: -; CHECK-RV32-NEXT: mv a5, a6 -; CHECK-RV32-NEXT: .LBB36_2: -; CHECK-RV32-NEXT: mv a7, a5 -; CHECK-RV32-NEXT: bltu a5, a4, .LBB36_4 -; CHECK-RV32-NEXT: # %bb.3: -; CHECK-RV32-NEXT: mv a7, a4 -; CHECK-RV32-NEXT: .LBB36_4: -; CHECK-RV32-NEXT: addi sp, sp, -16 -; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 -; CHECK-RV32-NEXT: csrr t0, vlenb -; CHECK-RV32-NEXT: slli t0, t0, 3 -; CHECK-RV32-NEXT: sub sp, sp, t0 -; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # 
sp + 16 + 8 * vlenb -; CHECK-RV32-NEXT: vl8re64.v v0, (a0) -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: vsetvli zero, a7, e64, m8, ta, ma -; CHECK-RV32-NEXT: vmv1r.v v0, v24 -; CHECK-RV32-NEXT: vsse64.v v8, (a1), a2, v0.t -; CHECK-RV32-NEXT: sub a0, a5, a4 -; CHECK-RV32-NEXT: sltu t0, a5, a0 -; CHECK-RV32-NEXT: addi t0, t0, -1 -; CHECK-RV32-NEXT: and a0, t0, a0 -; CHECK-RV32-NEXT: mul a7, a7, a2 -; CHECK-RV32-NEXT: add a7, a1, a7 -; CHECK-RV32-NEXT: srli t0, a4, 3 -; CHECK-RV32-NEXT: vsetvli t1, zero, e8, mf4, ta, ma -; CHECK-RV32-NEXT: vslidedown.vx v0, v24, t0 -; CHECK-RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-RV32-NEXT: sub a0, a3, a6 -; CHECK-RV32-NEXT: sltu a3, a3, a0 -; CHECK-RV32-NEXT: addi a3, a3, -1 -; CHECK-RV32-NEXT: and a0, a3, a0 -; CHECK-RV32-NEXT: vsse64.v v16, (a7), a2, v0.t -; CHECK-RV32-NEXT: bltu a0, a4, .LBB36_6 -; CHECK-RV32-NEXT: # %bb.5: -; CHECK-RV32-NEXT: mv a0, a4 -; CHECK-RV32-NEXT: .LBB36_6: -; CHECK-RV32-NEXT: mul a3, a5, a2 -; CHECK-RV32-NEXT: add a1, a1, a3 -; CHECK-RV32-NEXT: srli a4, a4, 2 -; CHECK-RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; CHECK-RV32-NEXT: vslidedown.vx v0, v24, a4 -; CHECK-RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-RV32-NEXT: vsse64.v v8, (a1), a2, v0.t -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 -; CHECK-RV32-NEXT: add sp, sp, a0 -; CHECK-RV32-NEXT: addi sp, sp, 16 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: strided_store_nxv17f64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: csrr a4, vlenb -; CHECK-RV64-NEXT: slli a6, a4, 1 -; CHECK-RV64-NEXT: vmv1r.v v24, v0 -; CHECK-RV64-NEXT: mv a5, a3 -; CHECK-RV64-NEXT: bltu a3, a6, .LBB36_2 -; CHECK-RV64-NEXT: # %bb.1: -; CHECK-RV64-NEXT: mv a5, a6 -; CHECK-RV64-NEXT: .LBB36_2: -; CHECK-RV64-NEXT: mv a7, a5 -; CHECK-RV64-NEXT: bltu a5, a4, .LBB36_4 -; CHECK-RV64-NEXT: # %bb.3: -; CHECK-RV64-NEXT: mv a7, a4 -; CHECK-RV64-NEXT: .LBB36_4: -; CHECK-RV64-NEXT: addi sp, sp, -16 -; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 -; CHECK-RV64-NEXT: csrr t0, vlenb -; CHECK-RV64-NEXT: slli t0, t0, 3 -; CHECK-RV64-NEXT: sub sp, sp, t0 -; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-RV64-NEXT: vl8re64.v v0, (a0) -; CHECK-RV64-NEXT: addi a0, sp, 16 -; CHECK-RV64-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: vsetvli zero, a7, e64, m8, ta, ma -; CHECK-RV64-NEXT: vmv1r.v v0, v24 -; CHECK-RV64-NEXT: vsse64.v v8, (a1), a2, v0.t -; CHECK-RV64-NEXT: sub a0, a5, a4 -; CHECK-RV64-NEXT: sltu t0, a5, a0 -; CHECK-RV64-NEXT: addi t0, t0, -1 -; CHECK-RV64-NEXT: and a0, t0, a0 -; CHECK-RV64-NEXT: mul a7, a7, a2 -; CHECK-RV64-NEXT: add a7, a1, a7 -; CHECK-RV64-NEXT: srli t0, a4, 3 -; CHECK-RV64-NEXT: vsetvli t1, zero, e8, mf4, ta, ma -; CHECK-RV64-NEXT: vslidedown.vx v0, v24, t0 -; CHECK-RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-RV64-NEXT: sub a0, a3, a6 -; CHECK-RV64-NEXT: sltu a3, a3, a0 -; CHECK-RV64-NEXT: addi a3, a3, -1 -; CHECK-RV64-NEXT: and a0, a3, a0 -; CHECK-RV64-NEXT: vsse64.v v16, (a7), a2, v0.t -; CHECK-RV64-NEXT: bltu a0, a4, .LBB36_6 -; CHECK-RV64-NEXT: # %bb.5: -; CHECK-RV64-NEXT: mv a0, a4 -; CHECK-RV64-NEXT: .LBB36_6: -; CHECK-RV64-NEXT: mul a3, a5, a2 -; CHECK-RV64-NEXT: add a1, a1, a3 -; CHECK-RV64-NEXT: srli a4, a4, 2 -; CHECK-RV64-NEXT: vsetvli 
a3, zero, e8, mf2, ta, ma
-; CHECK-RV64-NEXT: vslidedown.vx v0, v24, a4
-; CHECK-RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-RV64-NEXT: addi a0, sp, 16
-; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT: vsse64.v v8, (a1), a2, v0.t
-; CHECK-RV64-NEXT: csrr a0, vlenb
-; CHECK-RV64-NEXT: slli a0, a0, 3
-; CHECK-RV64-NEXT: add sp, sp, a0
-; CHECK-RV64-NEXT: addi sp, sp, 16
-; CHECK-RV64-NEXT: ret
+; CHECK-LABEL: strided_store_nxv17f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a6, a4, 1
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: mv a5, a3
+; CHECK-NEXT: bltu a3, a6, .LBB43_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a5, a6
+; CHECK-NEXT: .LBB43_2:
+; CHECK-NEXT: mv a7, a5
+; CHECK-NEXT: bltu a5, a4, .LBB43_4
+; CHECK-NEXT: # %bb.3:
+; CHECK-NEXT: mv a7, a4
+; CHECK-NEXT: .LBB43_4:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr t0, vlenb
+; CHECK-NEXT: slli t0, t0, 3
+; CHECK-NEXT: sub sp, sp, t0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vl8re64.v v0, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetvli zero, a7, e64, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsse64.v v8, (a1), a2, v0.t
+; CHECK-NEXT: sub a0, a5, a4
+; CHECK-NEXT: sltu t0, a5, a0
+; CHECK-NEXT: addi t0, t0, -1
+; CHECK-NEXT: and a0, t0, a0
+; CHECK-NEXT: mul a7, a7, a2
+; CHECK-NEXT: add a7, a1, a7
+; CHECK-NEXT: srli t0, a4, 3
+; CHECK-NEXT: vsetvli t1, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, t0
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: sub a0, a3, a6
+; CHECK-NEXT: sltu a3, a3, a0
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: vsse64.v v16, (a7), a2, v0.t
+; CHECK-NEXT: bltu a0, a4, .LBB43_6
+; CHECK-NEXT: # %bb.5:
+; CHECK-NEXT: mv a0, a4
+; CHECK-NEXT: .LBB43_6:
+; CHECK-NEXT: mul a3, a5, a2
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: srli a4, a4, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v24, a4
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsse64.v v8, (a1), a2, v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
 call void @llvm.experimental.vp.strided.store.nxv17f64.p0.i32(<vscale x 17 x double> %v, ptr %ptr, i32 %stride, <vscale x 17 x i1> %mask, i32 %evl)
 ret void
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index dde54d36d55a1..6d152cef124b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -104,74 +104,58 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
 ; CHECK-NEXT: addi sp, sp, -16
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: li a2, 40
+; CHECK-NEXT: li a2, 24
 ; CHECK-NEXT: mul a1, a1, a2
 ; CHECK-NEXT: sub sp, sp, a1
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT: csrr a1, vlenb
 ; CHECK-NEXT: slli a1, a1, 
3 ; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: vl8re64.v v24, (a1) +; CHECK-NEXT: vl8re64.v v8, (a1) ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: li a2, 24 -; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vadd.vv v16, v8, v8 ; CHECK-NEXT: vrgather.vv v8, v0, v16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vrgather.vv v0, v24, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v24, v16, 1 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v24, v8, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vadd.vi v8, v16, 1 +; CHECK-NEXT: vrgather.vv v16, v0, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vrgather.vv v24, v0, v8 ; CHECK-NEXT: vmv4r.v v0, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v12, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv4r.v v28, v8 ; CHECK-NEXT: vmv4r.v v20, v0 +; CHECK-NEXT: vmv8r.v v8, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index b6cb7f9f5ff10..b8cef5816687b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -8,18 +8,18 @@ define {, } @vector_deinterleave_nxv16i1_nxv ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v12, v8, 1, v0 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, 
v0, a0
 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmerge.vim v14, v8, 1, v0
-; CHECK-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: vnsrl.wi v10, v12, 8
-; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT: vnsrl.wi v12, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v12, 0
+; CHECK-NEXT: vnsrl.wi v12, v8, 8
+; CHECK-NEXT: vmsne.vi v8, v12, 0
 ; CHECK-NEXT: ret
 %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
 ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
@@ -165,70 +165,55 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
 ; CHECK-NEXT: addi sp, sp, -16
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 40
+; CHECK-NEXT: li a1, 24
 ; CHECK-NEXT: mul a0, a0, a1
 ; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: slli a0, a0, 4
 ; CHECK-NEXT: add a0, sp, a0
 ; CHECK-NEXT: addi a0, a0, 16
 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT: vmv8r.v v24, v8
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT: vid.v v8
 ; CHECK-NEXT: vadd.vv v0, v8, v8
 ; CHECK-NEXT: vrgather.vv v8, v24, v0
-; CHECK-NEXT: vrgather.vv v24, v16, v0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: slli a0, a0, 4
 ; CHECK-NEXT: add a0, sp, a0
 ; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vadd.vi v16, v0, 1
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vrgather.vv v16, v8, v0
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add a0, sp, a0
 ; CHECK-NEXT: addi a0, a0, 16
 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: vadd.vi v8, v0, 1
+; CHECK-NEXT: vrgather.vv v0, v24, v8
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 4
 ; CHECK-NEXT: add a0, sp, a0
 ; CHECK-NEXT: addi a0, a0, 16
 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vv v0, v24, v16
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 24
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vrgather.vv v16, v24, v0
+; CHECK-NEXT: vrgather.vv v16, v24, v8
 ; CHECK-NEXT: vmv4r.v v24, v16
 ; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 5
+; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add a0, sp, a0
 ; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmv4r.v v12, v16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT: addi a0, sp, 16
 ; CHECK-NEXT: vl8r.v v16, (a0) # 
Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v20, v24 +; CHECK-NEXT: vmv4r.v v20, v8 +; CHECK-NEXT: vmv4r.v v4, v24 +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vmv8r.v v16, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 @@ -371,70 +356,55 @@ define {, } @vector_deinterleave_nxv8f ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv8r.v v24, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vadd.vv v0, v8, v8 ; CHECK-NEXT: vrgather.vv v8, v24, v0 -; CHECK-NEXT: vrgather.vv v24, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vi v16, v0, 1 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vrgather.vv v16, v8, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vadd.vi v8, v0, 1 +; CHECK-NEXT: vrgather.vv v0, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v0, v24, v16 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vrgather.vv v16, v24, v0 +; CHECK-NEXT: vrgather.vv v16, v24, v8 ; CHECK-NEXT: vmv4r.v v24, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v12, v16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmv4r.v v20, v24 +; CHECK-NEXT: vmv4r.v v20, v8 +; CHECK-NEXT: vmv4r.v v4, v24 +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vmv8r.v v16, v0 ; 
CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a1, 24 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index fa2b49e862fd5..c18602c98e6b8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -1206,7 +1206,8 @@ define @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfmadd_vf_nxv32f16( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, ) #1 define i64 @vmv_x_s(i8 zeroext %cond, %0, %1, i64 %2) #0 { @@ -69,7 +67,6 @@ ret i64 %d } - ; Function Attrs: nounwind declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #2 define @vsetvli_add_or_sub(i8 zeroext %cond, %0, %1, i64 %avl) #0 { @@ -133,28 +130,20 @@ ret void } - ; Function Attrs: nofree nosync nounwind readnone willreturn declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) - ; Function Attrs: nounwind readnone declare @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(, , , i64) #1 - ; Function Attrs: nounwind readnone declare @llvm.riscv.vsub.nxv1i64.nxv1i64.i64(, , , i64) #1 - ; Function Attrs: nounwind readonly declare @llvm.riscv.vle.nxv1i64.i64(, * nocapture, i64) #3 - ; Function Attrs: nounwind readonly declare @llvm.riscv.vle.nxv1i32.i64(, * nocapture, i64) #3 - ; Function Attrs: nounwind writeonly declare void @llvm.riscv.vse.nxv1i64.i64(, * nocapture, i64) #4 - ; Function Attrs: nounwind readnone declare @llvm.riscv.vzext.nxv1i64.nxv1i32.i64(, , i64) #1 - ; Function Attrs: nounwind readnone declare @llvm.riscv.vsext.nxv1i64.nxv1i32.i64(, , i64) #1 attributes #0 = { "target-features"="+v" } @@ -452,7 +441,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v9 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr = COPY $v8 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $x10 - ; CHECK-NEXT: $x0 = PseudoVSETVLI [[COPY]], 88 /* e64, m1, ta, mu */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoVSETVLI:%[0-9]+]]:gprnox0 = PseudoVSETVLI [[COPY]], 88 /* e64, m1, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr = COPY $x0 ; CHECK-NEXT: BEQ [[COPY3]], [[COPY4]], %bb.2 ; CHECK-NEXT: PseudoBR %bb.1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll index 2a6c9d03f84c1..4d0f640408dd2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll @@ -102,23 +102,23 @@ declare @llvm.riscv.vmand.nxv1i1.i64(, This Inner Loop Header: Depth=1 -; CHECK-NEXT: slli a6, a4, 2 -; CHECK-NEXT: add a5, a0, a6 +; CHECK-NEXT: slli a3, a4, 2 +; CHECK-NEXT: add a5, a0, a3 ; CHECK-NEXT: vle32.v v8, (a5) ; CHECK-NEXT: vmsle.vi v9, v8, -3 ; CHECK-NEXT: vmsgt.vi v10, v8, 2 ; CHECK-NEXT: vmor.mm v0, v9, v10 -; CHECK-NEXT: add a6, a6, a1 -; CHECK-NEXT: vse32.v v8, (a6), v0.t -; CHECK-NEXT: add a4, a4, a3 -; CHECK-NEXT: vsetvli a3, a2, e32, m1, ta, ma -; CHECK-NEXT: bnez a3, .LBB5_2 +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vse32.v v8, (a3), v0.t +; CHECK-NEXT: add a4, a4, a6 +; CHECK-NEXT: vsetvli a6, a2, e32, m1, ta, ma +; CHECK-NEXT: bnez a6, .LBB5_2 ; CHECK-NEXT: .LBB5_3: # %for.cond.cleanup ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir index 05453ed038547..7bda7a387c68f 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir @@ -362,7 +362,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v9 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr = COPY $v8 - ; CHECK-NEXT: $x0 = PseudoVSETVLI [[COPY]], 88 /* e64, m1, ta, mu */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoVSETVLI:%[0-9]+]]:gprnox0 = PseudoVSETVLI [[COPY]], 88 /* e64, m1, ta, mu */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt:vr = IMPLICIT_DEF ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 %pt, [[COPY2]], [[COPY1]], $noreg, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: $v8 = COPY [[PseudoVADD_VV_M1_]] diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index 458ca3e473ef7..97121c275a294 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -492,23 +492,23 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: slli a1, a2, 25 ; RV32I-NEXT: srli a1, a1, 28 ; RV32I-NEXT: addi a3, sp, 16 -; RV32I-NEXT: sub a3, a3, a1 -; RV32I-NEXT: lbu a1, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: lbu a3, 5(a1) +; RV32I-NEXT: lbu a4, 4(a1) +; RV32I-NEXT: lbu a5, 6(a1) +; RV32I-NEXT: lbu a6, 7(a1) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: andi a2, a2, 7 -; RV32I-NEXT: sll a4, a1, a2 -; RV32I-NEXT: lbu a5, 1(a3) -; RV32I-NEXT: lbu a6, 0(a3) -; RV32I-NEXT: lbu a7, 2(a3) -; RV32I-NEXT: lbu t0, 3(a3) +; RV32I-NEXT: sll a4, a3, a2 +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 2(a1) +; RV32I-NEXT: lbu t0, 3(a1) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a6 ; RV32I-NEXT: slli a7, a7, 16 @@ -519,10 +519,10 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: xori a7, a2, 31 ; RV32I-NEXT: srl a6, a6, a7 ; RV32I-NEXT: or a4, a4, a6 -; RV32I-NEXT: lbu a6, 9(a3) -; RV32I-NEXT: lbu t0, 8(a3) -; RV32I-NEXT: lbu t1, 10(a3) -; RV32I-NEXT: lbu t2, 11(a3) +; RV32I-NEXT: lbu a6, 9(a1) +; RV32I-NEXT: lbu t0, 8(a1) +; RV32I-NEXT: lbu t1, 10(a1) +; RV32I-NEXT: lbu t2, 11(a1) ; RV32I-NEXT: slli a6, a6, 8 ; RV32I-NEXT: or a6, a6, t0 ; RV32I-NEXT: slli t1, t1, 16 @@ -530,28 +530,28 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a6, t0, a6 ; RV32I-NEXT: sll t0, a6, a2 -; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: srli a3, a3, 1 ; RV32I-NEXT: not t1, a2 -; RV32I-NEXT: srl a1, a1, t1 -; RV32I-NEXT: or a1, t0, a1 -; RV32I-NEXT: lbu t0, 13(a3) -; RV32I-NEXT: lbu t1, 12(a3) -; RV32I-NEXT: lbu t2, 14(a3) -; RV32I-NEXT: lbu a3, 15(a3) +; RV32I-NEXT: srl a3, a3, t1 +; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: lbu t0, 13(a1) +; RV32I-NEXT: lbu t1, 12(a1) +; RV32I-NEXT: lbu t2, 14(a1) +; RV32I-NEXT: lbu a1, 15(a1) ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: or t0, t0, t1 ; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, t2 -; RV32I-NEXT: or a3, a3, t0 -; RV32I-NEXT: sll a3, a3, a2 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: sll a1, a1, a2 ; RV32I-NEXT: srli a6, a6, 1 ; RV32I-NEXT: srl a6, 
a6, a7 -; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: sll a2, a5, a2 ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a3, 12(a0) -; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a1, 12(a0) +; RV32I-NEXT: sw a3, 8(a0) ; RV32I-NEXT: sw a4, 4(a0) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret @@ -581,24 +581,24 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { ; RV32I-LABEL: fshr64_minsize: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a5, a2, 32 +; RV32I-NEXT: andi a4, a2, 32 ; RV32I-NEXT: mv a3, a0 -; RV32I-NEXT: beqz a5, .LBB9_2 +; RV32I-NEXT: beqz a4, .LBB9_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: srl a4, a3, a2 -; RV32I-NEXT: beqz a5, .LBB9_4 +; RV32I-NEXT: srl a5, a3, a2 +; RV32I-NEXT: beqz a4, .LBB9_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: .LBB9_4: ; RV32I-NEXT: slli a0, a1, 1 -; RV32I-NEXT: not a5, a2 -; RV32I-NEXT: sll a0, a0, a5 -; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: not a4, a2 +; RV32I-NEXT: sll a0, a0, a4 +; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: srl a1, a1, a2 ; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: sll a2, a3, a5 +; RV32I-NEXT: sll a2, a3, a4 ; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: ret ; @@ -617,56 +617,56 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV32I-LABEL: fshr128_minsize: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 8(a1) -; RV32I-NEXT: lw t1, 0(a1) +; RV32I-NEXT: lw t2, 0(a1) ; RV32I-NEXT: lw a2, 0(a2) -; RV32I-NEXT: lw t0, 4(a1) +; RV32I-NEXT: lw a7, 4(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: andi t2, a2, 64 -; RV32I-NEXT: mv a7, t0 -; RV32I-NEXT: mv a4, t1 -; RV32I-NEXT: beqz t2, .LBB10_2 +; RV32I-NEXT: andi t1, a2, 64 +; RV32I-NEXT: mv t0, a7 +; RV32I-NEXT: mv a4, t2 +; RV32I-NEXT: beqz t1, .LBB10_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: mv a7, a1 +; RV32I-NEXT: mv t0, a1 ; RV32I-NEXT: mv a4, a3 ; RV32I-NEXT: .LBB10_2: ; RV32I-NEXT: andi a6, a2, 32 ; RV32I-NEXT: mv a5, a4 ; RV32I-NEXT: bnez a6, .LBB10_13 ; RV32I-NEXT: # %bb.3: -; RV32I-NEXT: bnez t2, .LBB10_14 +; RV32I-NEXT: bnez t1, .LBB10_14 ; RV32I-NEXT: .LBB10_4: ; RV32I-NEXT: beqz a6, .LBB10_6 ; RV32I-NEXT: .LBB10_5: -; RV32I-NEXT: mv a7, a3 +; RV32I-NEXT: mv t0, a3 ; RV32I-NEXT: .LBB10_6: -; RV32I-NEXT: slli t3, a7, 1 -; RV32I-NEXT: not t1, a2 -; RV32I-NEXT: beqz t2, .LBB10_8 +; RV32I-NEXT: slli t3, t0, 1 +; RV32I-NEXT: not t2, a2 +; RV32I-NEXT: beqz t1, .LBB10_8 ; RV32I-NEXT: # %bb.7: -; RV32I-NEXT: mv a1, t0 +; RV32I-NEXT: mv a1, a7 ; RV32I-NEXT: .LBB10_8: -; RV32I-NEXT: srl t2, a5, a2 -; RV32I-NEXT: sll t3, t3, t1 -; RV32I-NEXT: srl t0, a7, a2 +; RV32I-NEXT: srl a7, a5, a2 +; RV32I-NEXT: sll t1, t3, t2 +; RV32I-NEXT: srl t0, t0, a2 ; RV32I-NEXT: beqz a6, .LBB10_10 ; RV32I-NEXT: # %bb.9: ; RV32I-NEXT: mv a3, a1 ; RV32I-NEXT: .LBB10_10: -; RV32I-NEXT: or a7, t3, t2 -; RV32I-NEXT: slli t2, a3, 1 -; RV32I-NEXT: sll t2, t2, t1 -; RV32I-NEXT: or t0, t2, t0 +; RV32I-NEXT: or a7, t1, a7 +; RV32I-NEXT: slli t1, a3, 1 +; RV32I-NEXT: sll t1, t1, t2 +; RV32I-NEXT: or t0, t1, t0 ; RV32I-NEXT: srl a3, a3, a2 ; RV32I-NEXT: beqz a6, .LBB10_12 ; RV32I-NEXT: # %bb.11: ; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: .LBB10_12: ; RV32I-NEXT: slli a4, a1, 1 -; RV32I-NEXT: sll a4, a4, t1 +; RV32I-NEXT: sll a4, a4, t2 ; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: srl a1, a1, a2 ; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: sll a2, a5, t1 +; RV32I-NEXT: sll a2, a5, t2 ; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: sw a1, 12(a0) ; RV32I-NEXT: sw a3, 8(a0) @@ -674,33 +674,33 @@ 
define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind { ; RV32I-NEXT: sw a7, 0(a0) ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB10_13: -; RV32I-NEXT: mv a5, a7 -; RV32I-NEXT: beqz t2, .LBB10_4 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: beqz t1, .LBB10_4 ; RV32I-NEXT: .LBB10_14: -; RV32I-NEXT: mv a3, t1 +; RV32I-NEXT: mv a3, t2 ; RV32I-NEXT: bnez a6, .LBB10_5 ; RV32I-NEXT: j .LBB10_6 ; ; RV64I-LABEL: fshr128_minsize: ; RV64I: # %bb.0: -; RV64I-NEXT: andi a5, a2, 64 +; RV64I-NEXT: andi a4, a2, 64 ; RV64I-NEXT: mv a3, a0 -; RV64I-NEXT: beqz a5, .LBB10_2 +; RV64I-NEXT: beqz a4, .LBB10_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a3, a1 ; RV64I-NEXT: .LBB10_2: -; RV64I-NEXT: srl a4, a3, a2 -; RV64I-NEXT: beqz a5, .LBB10_4 +; RV64I-NEXT: srl a5, a3, a2 +; RV64I-NEXT: beqz a4, .LBB10_4 ; RV64I-NEXT: # %bb.3: ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: .LBB10_4: ; RV64I-NEXT: slli a0, a1, 1 -; RV64I-NEXT: not a5, a2 -; RV64I-NEXT: sll a0, a0, a5 -; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: not a4, a2 +; RV64I-NEXT: sll a0, a0, a4 +; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: srl a1, a1, a2 ; RV64I-NEXT: slli a3, a3, 1 -; RV64I-NEXT: sll a2, a3, a5 +; RV64I-NEXT: sll a2, a3, a4 ; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: ret %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %b) diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index 231c066de5437..b5f1efa4b160b 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -1085,15 +1085,15 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s0, 24(a1) -; RV32I-NEXT: lw s1, 28(a1) -; RV32I-NEXT: lw s2, 16(a1) -; RV32I-NEXT: lw s3, 20(a1) -; RV32I-NEXT: lw s4, 8(a1) -; RV32I-NEXT: lw s5, 12(a1) +; RV32I-NEXT: lw s1, 24(a1) +; RV32I-NEXT: lw s2, 28(a1) +; RV32I-NEXT: lw s3, 16(a1) +; RV32I-NEXT: lw s4, 20(a1) +; RV32I-NEXT: lw s5, 8(a1) +; RV32I-NEXT: lw s6, 12(a1) ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: li a3, 0 @@ -1101,33 +1101,33 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: mv s8, a1 ; RV32I-NEXT: li a2, 654 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: mv a0, s5 +; RV32I-NEXT: mv a1, s6 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: mv s5, a1 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: mv s6, a1 ; RV32I-NEXT: li a2, 23 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a1, s4 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s4, a1 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s2 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __moddi3@plt -; RV32I-NEXT: sw a1, 28(s6) -; RV32I-NEXT: sw a0, 24(s6) -; RV32I-NEXT: sw s3, 20(s6) -; RV32I-NEXT: sw s2, 16(s6) -; RV32I-NEXT: sw s5, 12(s6) -; RV32I-NEXT: sw s4, 8(s6) -; RV32I-NEXT: sw s8, 4(s6) -; RV32I-NEXT: sw s7, 0(s6) +; RV32I-NEXT: sw a1, 28(s0) +; RV32I-NEXT: sw a0, 24(s0) +; RV32I-NEXT: sw s4, 20(s0) +; RV32I-NEXT: sw s3, 16(s0) +; RV32I-NEXT: sw s6, 
12(s0) +; RV32I-NEXT: sw s5, 8(s0) +; RV32I-NEXT: sw s8, 4(s0) +; RV32I-NEXT: sw s7, 0(s0) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -1154,15 +1154,15 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s0, 24(a1) -; RV32IM-NEXT: lw s1, 28(a1) -; RV32IM-NEXT: lw s2, 16(a1) -; RV32IM-NEXT: lw s3, 20(a1) -; RV32IM-NEXT: lw s4, 8(a1) -; RV32IM-NEXT: lw s5, 12(a1) +; RV32IM-NEXT: lw s1, 24(a1) +; RV32IM-NEXT: lw s2, 28(a1) +; RV32IM-NEXT: lw s3, 16(a1) +; RV32IM-NEXT: lw s4, 20(a1) +; RV32IM-NEXT: lw s5, 8(a1) +; RV32IM-NEXT: lw s6, 12(a1) ; RV32IM-NEXT: lw a3, 0(a1) ; RV32IM-NEXT: lw a1, 4(a1) -; RV32IM-NEXT: mv s6, a0 +; RV32IM-NEXT: mv s0, a0 ; RV32IM-NEXT: li a2, 1 ; RV32IM-NEXT: mv a0, a3 ; RV32IM-NEXT: li a3, 0 @@ -1170,33 +1170,33 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: mv s7, a0 ; RV32IM-NEXT: mv s8, a1 ; RV32IM-NEXT: li a2, 654 -; RV32IM-NEXT: mv a0, s4 -; RV32IM-NEXT: mv a1, s5 +; RV32IM-NEXT: mv a0, s5 +; RV32IM-NEXT: mv a1, s6 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: mv s4, a0 -; RV32IM-NEXT: mv s5, a1 +; RV32IM-NEXT: mv s5, a0 +; RV32IM-NEXT: mv s6, a1 ; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: mv a0, s3 +; RV32IM-NEXT: mv a1, s4 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: mv s2, a0 -; RV32IM-NEXT: mv s3, a1 +; RV32IM-NEXT: mv s3, a0 +; RV32IM-NEXT: mv s4, a1 ; RV32IM-NEXT: lui a0, 1 ; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s0 -; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: mv a0, s1 +; RV32IM-NEXT: mv a1, s2 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __moddi3@plt -; RV32IM-NEXT: sw a1, 28(s6) -; RV32IM-NEXT: sw a0, 24(s6) -; RV32IM-NEXT: sw s3, 20(s6) -; RV32IM-NEXT: sw s2, 16(s6) -; RV32IM-NEXT: sw s5, 12(s6) -; RV32IM-NEXT: sw s4, 8(s6) -; RV32IM-NEXT: sw s8, 4(s6) -; RV32IM-NEXT: sw s7, 0(s6) +; RV32IM-NEXT: sw a1, 28(s0) +; RV32IM-NEXT: sw a0, 24(s0) +; RV32IM-NEXT: sw s4, 20(s0) +; RV32IM-NEXT: sw s3, 16(s0) +; RV32IM-NEXT: sw s6, 12(s0) +; RV32IM-NEXT: sw s5, 8(s0) +; RV32IM-NEXT: sw s8, 4(s0) +; RV32IM-NEXT: sw s7, 0(s0) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll index ae78b3e9d6a2e..651df94bab496 100644 --- a/llvm/test/CodeGen/RISCV/stack-store-check.ll +++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll @@ -45,7 +45,7 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a2, sp, 584 ; CHECK-NEXT: sw s6, 584(sp) ; CHECK-NEXT: call __subtf3@plt -; CHECK-NEXT: lw s9, 616(sp) +; CHECK-NEXT: lw s1, 616(sp) ; CHECK-NEXT: lw s2, 620(sp) ; CHECK-NEXT: lw s3, 624(sp) ; CHECK-NEXT: lw s4, 628(sp) @@ -59,14 +59,14 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a0, sp, 568 ; CHECK-NEXT: addi a1, sp, 552 ; CHECK-NEXT: addi a2, sp, 536 -; CHECK-NEXT: sw s9, 552(sp) +; CHECK-NEXT: sw s1, 552(sp) ; CHECK-NEXT: call __subtf3@plt ; CHECK-NEXT: lw a0, 568(sp) ; CHECK-NEXT: sw a0, 40(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 572(sp) -; CHECK-NEXT: sw a0, 32(sp) # 4-byte Folded 
Spill +; CHECK-NEXT: sw a0, 28(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 576(sp) -; CHECK-NEXT: sw a0, 24(sp) # 4-byte Folded Spill +; CHECK-NEXT: sw a0, 20(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 580(sp) ; CHECK-NEXT: sw a0, 48(sp) # 4-byte Folded Spill ; CHECK-NEXT: sw zero, 500(sp) @@ -81,12 +81,11 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a2, sp, 488 ; CHECK-NEXT: sw s6, 504(sp) ; CHECK-NEXT: call __addtf3@plt -; CHECK-NEXT: lw s11, 520(sp) -; CHECK-NEXT: lw s10, 524(sp) +; CHECK-NEXT: lw s9, 520(sp) +; CHECK-NEXT: lw s11, 524(sp) ; CHECK-NEXT: lw s5, 528(sp) -; CHECK-NEXT: sw s5, 20(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw s1, 532(sp) -; CHECK-NEXT: sw s1, 16(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw s10, 532(sp) +; CHECK-NEXT: sw s10, 16(sp) # 4-byte Folded Spill ; CHECK-NEXT: lui a0, %hi(Y1) ; CHECK-NEXT: lw a1, %lo(Y1)(a0) ; CHECK-NEXT: sw a1, 52(sp) # 4-byte Folded Spill @@ -106,26 +105,27 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a0, sp, 328 ; CHECK-NEXT: addi a1, sp, 312 ; CHECK-NEXT: addi a2, sp, 296 -; CHECK-NEXT: sw s9, 312(sp) +; CHECK-NEXT: sw s1, 312(sp) ; CHECK-NEXT: call __multf3@plt ; CHECK-NEXT: lw a0, 328(sp) ; CHECK-NEXT: sw a0, 44(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 332(sp) ; CHECK-NEXT: sw a0, 36(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw a0, 336(sp) -; CHECK-NEXT: sw a0, 28(sp) # 4-byte Folded Spill -; CHECK-NEXT: lw s9, 340(sp) +; CHECK-NEXT: sw a0, 32(sp) # 4-byte Folded Spill +; CHECK-NEXT: lw a0, 340(sp) +; CHECK-NEXT: sw a0, 24(sp) # 4-byte Folded Spill ; CHECK-NEXT: sw s0, 468(sp) ; CHECK-NEXT: sw s8, 464(sp) ; CHECK-NEXT: sw s7, 460(sp) ; CHECK-NEXT: sw s6, 456(sp) -; CHECK-NEXT: sw s1, 452(sp) +; CHECK-NEXT: sw s10, 452(sp) ; CHECK-NEXT: sw s5, 448(sp) -; CHECK-NEXT: sw s10, 444(sp) +; CHECK-NEXT: sw s11, 444(sp) ; CHECK-NEXT: addi a0, sp, 472 ; CHECK-NEXT: addi a1, sp, 456 ; CHECK-NEXT: addi a2, sp, 440 -; CHECK-NEXT: sw s11, 440(sp) +; CHECK-NEXT: sw s9, 440(sp) ; CHECK-NEXT: call __addtf3@plt ; CHECK-NEXT: lw a3, 472(sp) ; CHECK-NEXT: lw a0, 476(sp) @@ -152,27 +152,27 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw a2, %lo(X+8)(a4) ; CHECK-NEXT: sw a3, %lo(X+4)(a4) ; CHECK-NEXT: sw a0, %lo(X)(a4) -; CHECK-NEXT: lw s4, 4(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s4, 212(sp) -; CHECK-NEXT: lw s3, 8(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s3, 208(sp) -; CHECK-NEXT: lw s2, 12(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s2, 204(sp) +; CHECK-NEXT: lw s8, 4(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s8, 212(sp) +; CHECK-NEXT: lw s4, 8(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s4, 208(sp) +; CHECK-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s3, 204(sp) ; CHECK-NEXT: lw a0, 52(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 200(sp) ; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 228(sp) -; CHECK-NEXT: lw s1, 24(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s1, 224(sp) -; CHECK-NEXT: lw s0, 32(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s0, 220(sp) +; CHECK-NEXT: lw s10, 20(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s10, 224(sp) +; CHECK-NEXT: lw s2, 28(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s2, 220(sp) ; CHECK-NEXT: addi a0, sp, 232 ; CHECK-NEXT: addi a1, sp, 216 ; CHECK-NEXT: addi a2, sp, 200 -; CHECK-NEXT: lw s8, 40(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw s8, 216(sp) +; CHECK-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s0, 216(sp) ; CHECK-NEXT: call __multf3@plt -; 
CHECK-NEXT: lw s5, 232(sp) +; CHECK-NEXT: lw s1, 232(sp) ; CHECK-NEXT: lw a0, 236(sp) ; CHECK-NEXT: sw a0, 0(sp) # 4-byte Folded Spill ; CHECK-NEXT: lw s6, 240(sp) @@ -183,13 +183,12 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw zero, 344(sp) ; CHECK-NEXT: lw a0, 16(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 372(sp) -; CHECK-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; CHECK-NEXT: sw a0, 368(sp) -; CHECK-NEXT: sw s10, 364(sp) +; CHECK-NEXT: sw s5, 368(sp) +; CHECK-NEXT: sw s11, 364(sp) ; CHECK-NEXT: addi a0, sp, 376 ; CHECK-NEXT: addi a1, sp, 360 ; CHECK-NEXT: addi a2, sp, 344 -; CHECK-NEXT: sw s11, 360(sp) +; CHECK-NEXT: sw s9, 360(sp) ; CHECK-NEXT: call __multf3@plt ; CHECK-NEXT: lw a0, 376(sp) ; CHECK-NEXT: lw a1, 388(sp) @@ -202,11 +201,12 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw a0, %lo(S)(a4) ; CHECK-NEXT: lw a0, 48(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 260(sp) -; CHECK-NEXT: sw s1, 256(sp) -; CHECK-NEXT: sw s0, 252(sp) -; CHECK-NEXT: sw s8, 248(sp) -; CHECK-NEXT: sw s9, 276(sp) -; CHECK-NEXT: lw a0, 28(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw s10, 256(sp) +; CHECK-NEXT: sw s2, 252(sp) +; CHECK-NEXT: sw s0, 248(sp) +; CHECK-NEXT: lw a0, 24(sp) # 4-byte Folded Reload +; CHECK-NEXT: sw a0, 276(sp) +; CHECK-NEXT: lw a0, 32(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 272(sp) ; CHECK-NEXT: lw a0, 36(sp) # 4-byte Folded Reload ; CHECK-NEXT: sw a0, 268(sp) @@ -236,7 +236,7 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: addi a0, sp, 184 ; CHECK-NEXT: addi a1, sp, 168 ; CHECK-NEXT: addi a2, sp, 152 -; CHECK-NEXT: sw s5, 168(sp) +; CHECK-NEXT: sw s1, 168(sp) ; CHECK-NEXT: call __addtf3@plt ; CHECK-NEXT: lw a0, 184(sp) ; CHECK-NEXT: lw a1, 196(sp) @@ -251,9 +251,9 @@ define void @main() local_unnamed_addr nounwind { ; CHECK-NEXT: sw zero, 112(sp) ; CHECK-NEXT: sw zero, 108(sp) ; CHECK-NEXT: sw zero, 104(sp) -; CHECK-NEXT: sw s4, 132(sp) -; CHECK-NEXT: sw s3, 128(sp) -; CHECK-NEXT: sw s2, 124(sp) +; CHECK-NEXT: sw s8, 132(sp) +; CHECK-NEXT: sw s4, 128(sp) +; CHECK-NEXT: sw s3, 124(sp) ; CHECK-NEXT: addi a0, sp, 136 ; CHECK-NEXT: addi a1, sp, 120 ; CHECK-NEXT: addi a2, sp, 104 diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll index d57c62170699a..8c0d97afe6c21 100644 --- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll @@ -15,34 +15,34 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: lw a6, 8(a1) ; RISCV32-NEXT: lw a4, 0(a2) ; RISCV32-NEXT: lw a5, 0(a1) -; RISCV32-NEXT: lw t3, 4(a1) +; RISCV32-NEXT: lw t2, 4(a1) ; RISCV32-NEXT: lw t0, 8(a2) ; RISCV32-NEXT: lw a2, 4(a2) ; RISCV32-NEXT: mulhu a1, a5, a4 -; RISCV32-NEXT: mul t1, t3, a4 +; RISCV32-NEXT: mul t1, t2, a4 ; RISCV32-NEXT: add a1, t1, a1 ; RISCV32-NEXT: sltu t1, a1, t1 -; RISCV32-NEXT: mulhu t2, t3, a4 -; RISCV32-NEXT: add t4, t2, t1 +; RISCV32-NEXT: mulhu t3, t2, a4 +; RISCV32-NEXT: add t4, t3, t1 ; RISCV32-NEXT: mul t1, a5, a2 ; RISCV32-NEXT: add a1, t1, a1 ; RISCV32-NEXT: sltu t1, a1, t1 -; RISCV32-NEXT: mulhu t2, a5, a2 -; RISCV32-NEXT: add t1, t2, t1 +; RISCV32-NEXT: mulhu t3, a5, a2 +; RISCV32-NEXT: add t1, t3, t1 ; RISCV32-NEXT: add t5, t4, t1 -; RISCV32-NEXT: mul t6, t3, a2 +; RISCV32-NEXT: mul t6, t2, a2 ; RISCV32-NEXT: add s0, t6, t5 ; RISCV32-NEXT: mul t1, t0, a5 ; RISCV32-NEXT: mul s3, a6, a4 ; RISCV32-NEXT: add s4, s3, t1 ; RISCV32-NEXT: add t1, s0, 
s4 -; RISCV32-NEXT: sltu t2, t1, s0 +; RISCV32-NEXT: sltu t3, t1, s0 ; RISCV32-NEXT: sltu s0, s0, t6 ; RISCV32-NEXT: sltu t4, t5, t4 -; RISCV32-NEXT: mulhu t5, t3, a2 +; RISCV32-NEXT: mulhu t5, t2, a2 ; RISCV32-NEXT: add t4, t5, t4 ; RISCV32-NEXT: add s0, t4, s0 -; RISCV32-NEXT: mul t4, t3, t0 +; RISCV32-NEXT: mul t4, t2, t0 ; RISCV32-NEXT: mul t5, a7, a5 ; RISCV32-NEXT: add t4, t5, t4 ; RISCV32-NEXT: mulhu s1, t0, a5 @@ -56,22 +56,22 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: sltu s3, s4, s3 ; RISCV32-NEXT: add t4, t4, s3 ; RISCV32-NEXT: add t4, s0, t4 -; RISCV32-NEXT: add t4, t4, t2 +; RISCV32-NEXT: add t4, t4, t3 ; RISCV32-NEXT: beq t4, s0, .LBB0_2 ; RISCV32-NEXT: # %bb.1: # %start -; RISCV32-NEXT: sltu t2, t4, s0 +; RISCV32-NEXT: sltu t3, t4, s0 ; RISCV32-NEXT: .LBB0_2: # %start ; RISCV32-NEXT: sltu s0, s2, s1 -; RISCV32-NEXT: snez s1, t3 +; RISCV32-NEXT: snez s1, t2 ; RISCV32-NEXT: snez s2, a7 ; RISCV32-NEXT: and s1, s2, s1 ; RISCV32-NEXT: mulhu s2, a7, a5 ; RISCV32-NEXT: snez s2, s2 ; RISCV32-NEXT: or s1, s1, s2 -; RISCV32-NEXT: mulhu t3, t3, t0 -; RISCV32-NEXT: snez t3, t3 -; RISCV32-NEXT: or t3, s1, t3 -; RISCV32-NEXT: or t3, t3, s0 +; RISCV32-NEXT: mulhu t2, t2, t0 +; RISCV32-NEXT: snez t2, t2 +; RISCV32-NEXT: or t2, s1, t2 +; RISCV32-NEXT: or t2, t2, s0 ; RISCV32-NEXT: sltu t5, t6, t5 ; RISCV32-NEXT: snez t6, a2 ; RISCV32-NEXT: snez s0, a3 @@ -89,7 +89,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: snez a3, a3 ; RISCV32-NEXT: and a3, a3, a7 ; RISCV32-NEXT: or a2, a3, a2 -; RISCV32-NEXT: or a3, t3, t2 +; RISCV32-NEXT: or a3, t2, t3 ; RISCV32-NEXT: or a2, a2, a3 ; RISCV32-NEXT: mul a3, a5, a4 ; RISCV32-NEXT: andi a2, a2, 1 diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index d8f364ec8c00f..a38ae17f19df3 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -791,15 +791,15 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lw s0, 24(a1) -; RV32I-NEXT: lw s1, 28(a1) -; RV32I-NEXT: lw s2, 16(a1) -; RV32I-NEXT: lw s3, 20(a1) -; RV32I-NEXT: lw s4, 8(a1) -; RV32I-NEXT: lw s5, 12(a1) +; RV32I-NEXT: lw s1, 24(a1) +; RV32I-NEXT: lw s2, 28(a1) +; RV32I-NEXT: lw s3, 16(a1) +; RV32I-NEXT: lw s4, 20(a1) +; RV32I-NEXT: lw s5, 8(a1) +; RV32I-NEXT: lw s6, 12(a1) ; RV32I-NEXT: lw a3, 0(a1) ; RV32I-NEXT: lw a1, 4(a1) -; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: li a2, 1 ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: li a3, 0 @@ -807,33 +807,33 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32I-NEXT: mv s7, a0 ; RV32I-NEXT: mv s8, a1 ; RV32I-NEXT: li a2, 654 -; RV32I-NEXT: mv a0, s4 -; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: mv a0, s5 +; RV32I-NEXT: mv a1, s6 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: mv s4, a0 -; RV32I-NEXT: mv s5, a1 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: mv s6, a1 ; RV32I-NEXT: li a2, 23 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a1, s4 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s4, a1 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a2, a0, 1327 -; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: 
mv a1, s2 ; RV32I-NEXT: li a3, 0 ; RV32I-NEXT: call __umoddi3@plt -; RV32I-NEXT: sw a1, 28(s6) -; RV32I-NEXT: sw a0, 24(s6) -; RV32I-NEXT: sw s3, 20(s6) -; RV32I-NEXT: sw s2, 16(s6) -; RV32I-NEXT: sw s5, 12(s6) -; RV32I-NEXT: sw s4, 8(s6) -; RV32I-NEXT: sw s8, 4(s6) -; RV32I-NEXT: sw s7, 0(s6) +; RV32I-NEXT: sw a1, 28(s0) +; RV32I-NEXT: sw a0, 24(s0) +; RV32I-NEXT: sw s4, 20(s0) +; RV32I-NEXT: sw s3, 16(s0) +; RV32I-NEXT: sw s6, 12(s0) +; RV32I-NEXT: sw s5, 8(s0) +; RV32I-NEXT: sw s8, 4(s0) +; RV32I-NEXT: sw s7, 0(s0) ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -860,15 +860,15 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s0, 24(a1) -; RV32IM-NEXT: lw s1, 28(a1) -; RV32IM-NEXT: lw s2, 16(a1) -; RV32IM-NEXT: lw s3, 20(a1) -; RV32IM-NEXT: lw s4, 8(a1) -; RV32IM-NEXT: lw s5, 12(a1) +; RV32IM-NEXT: lw s1, 24(a1) +; RV32IM-NEXT: lw s2, 28(a1) +; RV32IM-NEXT: lw s3, 16(a1) +; RV32IM-NEXT: lw s4, 20(a1) +; RV32IM-NEXT: lw s5, 8(a1) +; RV32IM-NEXT: lw s6, 12(a1) ; RV32IM-NEXT: lw a3, 0(a1) ; RV32IM-NEXT: lw a1, 4(a1) -; RV32IM-NEXT: mv s6, a0 +; RV32IM-NEXT: mv s0, a0 ; RV32IM-NEXT: li a2, 1 ; RV32IM-NEXT: mv a0, a3 ; RV32IM-NEXT: li a3, 0 @@ -876,33 +876,33 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: mv s7, a0 ; RV32IM-NEXT: mv s8, a1 ; RV32IM-NEXT: li a2, 654 -; RV32IM-NEXT: mv a0, s4 -; RV32IM-NEXT: mv a1, s5 +; RV32IM-NEXT: mv a0, s5 +; RV32IM-NEXT: mv a1, s6 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: mv s4, a0 -; RV32IM-NEXT: mv s5, a1 +; RV32IM-NEXT: mv s5, a0 +; RV32IM-NEXT: mv s6, a1 ; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s2 -; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: mv a0, s3 +; RV32IM-NEXT: mv a1, s4 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: mv s2, a0 -; RV32IM-NEXT: mv s3, a1 +; RV32IM-NEXT: mv s3, a0 +; RV32IM-NEXT: mv s4, a1 ; RV32IM-NEXT: lui a0, 1 ; RV32IM-NEXT: addi a2, a0, 1327 -; RV32IM-NEXT: mv a0, s0 -; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: mv a0, s1 +; RV32IM-NEXT: mv a1, s2 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3@plt -; RV32IM-NEXT: sw a1, 28(s6) -; RV32IM-NEXT: sw a0, 24(s6) -; RV32IM-NEXT: sw s3, 20(s6) -; RV32IM-NEXT: sw s2, 16(s6) -; RV32IM-NEXT: sw s5, 12(s6) -; RV32IM-NEXT: sw s4, 8(s6) -; RV32IM-NEXT: sw s8, 4(s6) -; RV32IM-NEXT: sw s7, 0(s6) +; RV32IM-NEXT: sw a1, 28(s0) +; RV32IM-NEXT: sw a0, 24(s0) +; RV32IM-NEXT: sw s4, 20(s0) +; RV32IM-NEXT: sw s3, 16(s0) +; RV32IM-NEXT: sw s6, 12(s0) +; RV32IM-NEXT: sw s5, 8(s0) +; RV32IM-NEXT: sw s8, 4(s0) +; RV32IM-NEXT: sw s7, 0(s0) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index 3550ac8de9648..b0d435368e92b 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1397,50 +1397,50 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 56(sp) ; RV64I-NEXT: andi a1, a1, 31 ; 
RV64I-NEXT: addi a0, sp, 56 -; RV64I-NEXT: add a5, a0, a1 -; RV64I-NEXT: lbu a0, 8(a5) +; RV64I-NEXT: add a6, a0, a1 +; RV64I-NEXT: lbu a0, 8(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a5) +; RV64I-NEXT: lbu a0, 9(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a5) +; RV64I-NEXT: lbu a0, 10(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a5) +; RV64I-NEXT: lbu a0, 11(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a5) +; RV64I-NEXT: lbu a0, 12(a6) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a5) -; RV64I-NEXT: lbu t0, 14(a5) -; RV64I-NEXT: lbu t1, 15(a5) -; RV64I-NEXT: lbu t2, 0(a5) -; RV64I-NEXT: lbu t3, 1(a5) -; RV64I-NEXT: lbu t4, 2(a5) -; RV64I-NEXT: lbu t5, 3(a5) -; RV64I-NEXT: lbu t6, 4(a5) -; RV64I-NEXT: lbu s0, 5(a5) -; RV64I-NEXT: lbu s1, 6(a5) -; RV64I-NEXT: lbu s2, 7(a5) -; RV64I-NEXT: lbu s3, 24(a5) -; RV64I-NEXT: lbu s4, 25(a5) -; RV64I-NEXT: lbu s5, 26(a5) -; RV64I-NEXT: lbu s6, 27(a5) -; RV64I-NEXT: lbu s7, 28(a5) -; RV64I-NEXT: lbu s8, 29(a5) -; RV64I-NEXT: lbu s9, 30(a5) -; RV64I-NEXT: lbu s10, 31(a5) -; RV64I-NEXT: lbu s11, 16(a5) -; RV64I-NEXT: lbu ra, 17(a5) -; RV64I-NEXT: lbu a6, 18(a5) -; RV64I-NEXT: lbu a4, 19(a5) -; RV64I-NEXT: lbu a0, 23(a5) -; RV64I-NEXT: lbu a1, 22(a5) -; RV64I-NEXT: lbu a3, 21(a5) -; RV64I-NEXT: lbu a5, 20(a5) +; RV64I-NEXT: lbu a7, 13(a6) +; RV64I-NEXT: lbu t0, 14(a6) +; RV64I-NEXT: lbu t1, 15(a6) +; RV64I-NEXT: lbu t2, 0(a6) +; RV64I-NEXT: lbu t3, 1(a6) +; RV64I-NEXT: lbu t4, 2(a6) +; RV64I-NEXT: lbu t5, 3(a6) +; RV64I-NEXT: lbu t6, 4(a6) +; RV64I-NEXT: lbu s0, 5(a6) +; RV64I-NEXT: lbu s1, 6(a6) +; RV64I-NEXT: lbu s2, 7(a6) +; RV64I-NEXT: lbu s3, 24(a6) +; RV64I-NEXT: lbu s4, 25(a6) +; RV64I-NEXT: lbu s5, 26(a6) +; RV64I-NEXT: lbu s6, 27(a6) +; RV64I-NEXT: lbu s7, 28(a6) +; RV64I-NEXT: lbu s8, 29(a6) +; RV64I-NEXT: lbu s9, 30(a6) +; RV64I-NEXT: lbu s10, 31(a6) +; RV64I-NEXT: lbu s11, 16(a6) +; RV64I-NEXT: lbu ra, 17(a6) +; RV64I-NEXT: lbu a5, 18(a6) +; RV64I-NEXT: lbu a4, 19(a6) +; RV64I-NEXT: lbu a0, 23(a6) +; RV64I-NEXT: lbu a1, 22(a6) +; RV64I-NEXT: lbu a3, 21(a6) +; RV64I-NEXT: lbu a6, 20(a6) ; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a5, 20(a2) +; RV64I-NEXT: sb a6, 20(a2) ; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: sb a5, 18(a2) ; RV64I-NEXT: sb ra, 17(a2) ; RV64I-NEXT: sb s11, 16(a2) ; RV64I-NEXT: sb s10, 31(a2) @@ -1615,50 +1615,50 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 28(sp) ; RV32I-NEXT: andi a1, a1, 31 ; RV32I-NEXT: addi a0, sp, 28 -; RV32I-NEXT: add a5, a0, a1 -; RV32I-NEXT: lbu a0, 6(a5) +; RV32I-NEXT: add a6, a0, a1 +; RV32I-NEXT: lbu a0, 6(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a5) +; RV32I-NEXT: lbu a0, 7(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a5) +; RV32I-NEXT: lbu a0, 4(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a0, 5(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a5) +; RV32I-NEXT: lbu a0, 0(a6) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a5) -; RV32I-NEXT: lbu t0, 2(a5) -; RV32I-NEXT: lbu t1, 3(a5) -; RV32I-NEXT: lbu t2, 14(a5) -; RV32I-NEXT: lbu t3, 15(a5) -; RV32I-NEXT: lbu t4, 12(a5) -; RV32I-NEXT: lbu t5, 13(a5) 
-; RV32I-NEXT: lbu t6, 10(a5) -; RV32I-NEXT: lbu s0, 11(a5) -; RV32I-NEXT: lbu s1, 8(a5) -; RV32I-NEXT: lbu s2, 9(a5) -; RV32I-NEXT: lbu s3, 22(a5) -; RV32I-NEXT: lbu s4, 23(a5) -; RV32I-NEXT: lbu s5, 20(a5) -; RV32I-NEXT: lbu s6, 21(a5) -; RV32I-NEXT: lbu s7, 18(a5) -; RV32I-NEXT: lbu s8, 19(a5) -; RV32I-NEXT: lbu s9, 16(a5) -; RV32I-NEXT: lbu s10, 17(a5) -; RV32I-NEXT: lbu s11, 30(a5) -; RV32I-NEXT: lbu ra, 31(a5) -; RV32I-NEXT: lbu a6, 28(a5) -; RV32I-NEXT: lbu a4, 29(a5) -; RV32I-NEXT: lbu a0, 25(a5) -; RV32I-NEXT: lbu a1, 24(a5) -; RV32I-NEXT: lbu a3, 27(a5) -; RV32I-NEXT: lbu a5, 26(a5) +; RV32I-NEXT: lbu a7, 1(a6) +; RV32I-NEXT: lbu t0, 2(a6) +; RV32I-NEXT: lbu t1, 3(a6) +; RV32I-NEXT: lbu t2, 14(a6) +; RV32I-NEXT: lbu t3, 15(a6) +; RV32I-NEXT: lbu t4, 12(a6) +; RV32I-NEXT: lbu t5, 13(a6) +; RV32I-NEXT: lbu t6, 10(a6) +; RV32I-NEXT: lbu s0, 11(a6) +; RV32I-NEXT: lbu s1, 8(a6) +; RV32I-NEXT: lbu s2, 9(a6) +; RV32I-NEXT: lbu s3, 22(a6) +; RV32I-NEXT: lbu s4, 23(a6) +; RV32I-NEXT: lbu s5, 20(a6) +; RV32I-NEXT: lbu s6, 21(a6) +; RV32I-NEXT: lbu s7, 18(a6) +; RV32I-NEXT: lbu s8, 19(a6) +; RV32I-NEXT: lbu s9, 16(a6) +; RV32I-NEXT: lbu s10, 17(a6) +; RV32I-NEXT: lbu s11, 30(a6) +; RV32I-NEXT: lbu ra, 31(a6) +; RV32I-NEXT: lbu a5, 28(a6) +; RV32I-NEXT: lbu a4, 29(a6) +; RV32I-NEXT: lbu a0, 25(a6) +; RV32I-NEXT: lbu a1, 24(a6) +; RV32I-NEXT: lbu a3, 27(a6) +; RV32I-NEXT: lbu a6, 26(a6) ; RV32I-NEXT: sb a0, 25(a2) ; RV32I-NEXT: sb a1, 24(a2) ; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a5, 26(a2) +; RV32I-NEXT: sb a6, 26(a2) ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a6, 28(a2) +; RV32I-NEXT: sb a5, 28(a2) ; RV32I-NEXT: sb ra, 31(a2) ; RV32I-NEXT: sb s11, 30(a2) ; RV32I-NEXT: sb s10, 17(a2) @@ -1840,50 +1840,50 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 88(sp) ; RV64I-NEXT: andi a1, a1, 31 ; RV64I-NEXT: addi a0, sp, 88 -; RV64I-NEXT: sub a5, a0, a1 -; RV64I-NEXT: lbu a0, 8(a5) +; RV64I-NEXT: sub a6, a0, a1 +; RV64I-NEXT: lbu a0, 8(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a5) +; RV64I-NEXT: lbu a0, 9(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a5) +; RV64I-NEXT: lbu a0, 10(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a5) +; RV64I-NEXT: lbu a0, 11(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a5) +; RV64I-NEXT: lbu a0, 12(a6) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a5) -; RV64I-NEXT: lbu t0, 14(a5) -; RV64I-NEXT: lbu t1, 15(a5) -; RV64I-NEXT: lbu t2, 0(a5) -; RV64I-NEXT: lbu t3, 1(a5) -; RV64I-NEXT: lbu t4, 2(a5) -; RV64I-NEXT: lbu t5, 3(a5) -; RV64I-NEXT: lbu t6, 4(a5) -; RV64I-NEXT: lbu s0, 5(a5) -; RV64I-NEXT: lbu s1, 6(a5) -; RV64I-NEXT: lbu s2, 7(a5) -; RV64I-NEXT: lbu s3, 24(a5) -; RV64I-NEXT: lbu s4, 25(a5) -; RV64I-NEXT: lbu s5, 26(a5) -; RV64I-NEXT: lbu s6, 27(a5) -; RV64I-NEXT: lbu s7, 28(a5) -; RV64I-NEXT: lbu s8, 29(a5) -; RV64I-NEXT: lbu s9, 30(a5) -; RV64I-NEXT: lbu s10, 31(a5) -; RV64I-NEXT: lbu s11, 16(a5) -; RV64I-NEXT: lbu ra, 17(a5) -; RV64I-NEXT: lbu a6, 18(a5) -; RV64I-NEXT: lbu a4, 19(a5) -; RV64I-NEXT: lbu a0, 23(a5) -; RV64I-NEXT: lbu a1, 22(a5) -; RV64I-NEXT: lbu a3, 21(a5) -; RV64I-NEXT: lbu a5, 20(a5) +; RV64I-NEXT: lbu a7, 13(a6) +; RV64I-NEXT: lbu t0, 14(a6) +; RV64I-NEXT: lbu t1, 15(a6) +; RV64I-NEXT: lbu t2, 0(a6) +; RV64I-NEXT: lbu t3, 1(a6) +; RV64I-NEXT: lbu t4, 2(a6) +; RV64I-NEXT: lbu t5, 3(a6) +; RV64I-NEXT: lbu t6, 
4(a6) +; RV64I-NEXT: lbu s0, 5(a6) +; RV64I-NEXT: lbu s1, 6(a6) +; RV64I-NEXT: lbu s2, 7(a6) +; RV64I-NEXT: lbu s3, 24(a6) +; RV64I-NEXT: lbu s4, 25(a6) +; RV64I-NEXT: lbu s5, 26(a6) +; RV64I-NEXT: lbu s6, 27(a6) +; RV64I-NEXT: lbu s7, 28(a6) +; RV64I-NEXT: lbu s8, 29(a6) +; RV64I-NEXT: lbu s9, 30(a6) +; RV64I-NEXT: lbu s10, 31(a6) +; RV64I-NEXT: lbu s11, 16(a6) +; RV64I-NEXT: lbu ra, 17(a6) +; RV64I-NEXT: lbu a5, 18(a6) +; RV64I-NEXT: lbu a4, 19(a6) +; RV64I-NEXT: lbu a0, 23(a6) +; RV64I-NEXT: lbu a1, 22(a6) +; RV64I-NEXT: lbu a3, 21(a6) +; RV64I-NEXT: lbu a6, 20(a6) ; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a5, 20(a2) +; RV64I-NEXT: sb a6, 20(a2) ; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: sb a5, 18(a2) ; RV64I-NEXT: sb ra, 17(a2) ; RV64I-NEXT: sb s11, 16(a2) ; RV64I-NEXT: sb s10, 31(a2) @@ -2058,50 +2058,50 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 60(sp) ; RV32I-NEXT: andi a1, a1, 31 ; RV32I-NEXT: addi a0, sp, 60 -; RV32I-NEXT: sub a5, a0, a1 -; RV32I-NEXT: lbu a0, 6(a5) +; RV32I-NEXT: sub a6, a0, a1 +; RV32I-NEXT: lbu a0, 6(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a5) +; RV32I-NEXT: lbu a0, 7(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a5) +; RV32I-NEXT: lbu a0, 4(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a0, 5(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a5) +; RV32I-NEXT: lbu a0, 0(a6) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a5) -; RV32I-NEXT: lbu t0, 2(a5) -; RV32I-NEXT: lbu t1, 3(a5) -; RV32I-NEXT: lbu t2, 14(a5) -; RV32I-NEXT: lbu t3, 15(a5) -; RV32I-NEXT: lbu t4, 12(a5) -; RV32I-NEXT: lbu t5, 13(a5) -; RV32I-NEXT: lbu t6, 10(a5) -; RV32I-NEXT: lbu s0, 11(a5) -; RV32I-NEXT: lbu s1, 8(a5) -; RV32I-NEXT: lbu s2, 9(a5) -; RV32I-NEXT: lbu s3, 22(a5) -; RV32I-NEXT: lbu s4, 23(a5) -; RV32I-NEXT: lbu s5, 20(a5) -; RV32I-NEXT: lbu s6, 21(a5) -; RV32I-NEXT: lbu s7, 18(a5) -; RV32I-NEXT: lbu s8, 19(a5) -; RV32I-NEXT: lbu s9, 16(a5) -; RV32I-NEXT: lbu s10, 17(a5) -; RV32I-NEXT: lbu s11, 30(a5) -; RV32I-NEXT: lbu ra, 31(a5) -; RV32I-NEXT: lbu a6, 28(a5) -; RV32I-NEXT: lbu a4, 29(a5) -; RV32I-NEXT: lbu a0, 25(a5) -; RV32I-NEXT: lbu a1, 24(a5) -; RV32I-NEXT: lbu a3, 27(a5) -; RV32I-NEXT: lbu a5, 26(a5) +; RV32I-NEXT: lbu a7, 1(a6) +; RV32I-NEXT: lbu t0, 2(a6) +; RV32I-NEXT: lbu t1, 3(a6) +; RV32I-NEXT: lbu t2, 14(a6) +; RV32I-NEXT: lbu t3, 15(a6) +; RV32I-NEXT: lbu t4, 12(a6) +; RV32I-NEXT: lbu t5, 13(a6) +; RV32I-NEXT: lbu t6, 10(a6) +; RV32I-NEXT: lbu s0, 11(a6) +; RV32I-NEXT: lbu s1, 8(a6) +; RV32I-NEXT: lbu s2, 9(a6) +; RV32I-NEXT: lbu s3, 22(a6) +; RV32I-NEXT: lbu s4, 23(a6) +; RV32I-NEXT: lbu s5, 20(a6) +; RV32I-NEXT: lbu s6, 21(a6) +; RV32I-NEXT: lbu s7, 18(a6) +; RV32I-NEXT: lbu s8, 19(a6) +; RV32I-NEXT: lbu s9, 16(a6) +; RV32I-NEXT: lbu s10, 17(a6) +; RV32I-NEXT: lbu s11, 30(a6) +; RV32I-NEXT: lbu ra, 31(a6) +; RV32I-NEXT: lbu a5, 28(a6) +; RV32I-NEXT: lbu a4, 29(a6) +; RV32I-NEXT: lbu a0, 25(a6) +; RV32I-NEXT: lbu a1, 24(a6) +; RV32I-NEXT: lbu a3, 27(a6) +; RV32I-NEXT: lbu a6, 26(a6) ; RV32I-NEXT: sb a0, 25(a2) ; RV32I-NEXT: sb a1, 24(a2) ; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a5, 26(a2) +; RV32I-NEXT: sb a6, 26(a2) ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a6, 28(a2) +; RV32I-NEXT: sb a5, 28(a2) ; RV32I-NEXT: sb ra, 31(a2) ; 
RV32I-NEXT: sb s11, 30(a2) ; RV32I-NEXT: sb s10, 17(a2) @@ -2172,8 +2172,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv t1, a1 -; RV64I-NEXT: lbu t0, 31(a0) +; RV64I-NEXT: mv t0, a1 +; RV64I-NEXT: lbu t1, 31(a0) ; RV64I-NEXT: lbu a1, 0(a0) ; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a1, 1(a0) @@ -2211,15 +2211,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu a1, 30(a0) ; RV64I-NEXT: lbu a3, 29(a0) ; RV64I-NEXT: lbu a0, 28(a0) -; RV64I-NEXT: lbu t1, 0(t1) +; RV64I-NEXT: lbu t0, 0(t0) ; RV64I-NEXT: sb a1, 86(sp) ; RV64I-NEXT: sb a3, 85(sp) ; RV64I-NEXT: sb a0, 84(sp) ; RV64I-NEXT: sb a4, 83(sp) ; RV64I-NEXT: sb a5, 82(sp) ; RV64I-NEXT: sb a6, 81(sp) -; RV64I-NEXT: sb t0, 87(sp) -; RV64I-NEXT: slli t0, t0, 56 +; RV64I-NEXT: sb t1, 87(sp) +; RV64I-NEXT: slli t1, t1, 56 ; RV64I-NEXT: sb a7, 80(sp) ; RV64I-NEXT: sb ra, 79(sp) ; RV64I-NEXT: sb s11, 78(sp) @@ -2251,7 +2251,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 57(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t0, 63 +; RV64I-NEXT: srai a0, t1, 63 ; RV64I-NEXT: sb a0, 112(sp) ; RV64I-NEXT: sb a0, 104(sp) ; RV64I-NEXT: sb a0, 96(sp) @@ -2291,52 +2291,52 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a6, 91(sp) ; RV64I-NEXT: sb a7, 90(sp) ; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: andi a0, t1, 31 +; RV64I-NEXT: andi a0, t0, 31 ; RV64I-NEXT: addi a1, sp, 56 -; RV64I-NEXT: add a5, a1, a0 -; RV64I-NEXT: lbu a0, 8(a5) +; RV64I-NEXT: add a6, a1, a0 +; RV64I-NEXT: lbu a0, 8(a6) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a5) +; RV64I-NEXT: lbu a0, 9(a6) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a5) +; RV64I-NEXT: lbu a0, 10(a6) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a5) +; RV64I-NEXT: lbu a0, 11(a6) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a5) +; RV64I-NEXT: lbu a0, 12(a6) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a5) -; RV64I-NEXT: lbu t0, 14(a5) -; RV64I-NEXT: lbu t1, 15(a5) -; RV64I-NEXT: lbu t2, 0(a5) -; RV64I-NEXT: lbu t3, 1(a5) -; RV64I-NEXT: lbu t4, 2(a5) -; RV64I-NEXT: lbu t5, 3(a5) -; RV64I-NEXT: lbu t6, 4(a5) -; RV64I-NEXT: lbu s0, 5(a5) -; RV64I-NEXT: lbu s1, 6(a5) -; RV64I-NEXT: lbu s2, 7(a5) -; RV64I-NEXT: lbu s3, 24(a5) -; RV64I-NEXT: lbu s4, 25(a5) -; RV64I-NEXT: lbu s5, 26(a5) -; RV64I-NEXT: lbu s6, 27(a5) -; RV64I-NEXT: lbu s7, 28(a5) -; RV64I-NEXT: lbu s8, 29(a5) -; RV64I-NEXT: lbu s9, 30(a5) -; RV64I-NEXT: lbu s10, 31(a5) -; RV64I-NEXT: lbu s11, 16(a5) -; RV64I-NEXT: lbu ra, 17(a5) -; RV64I-NEXT: lbu a6, 18(a5) -; RV64I-NEXT: lbu a4, 19(a5) -; RV64I-NEXT: lbu a0, 23(a5) -; RV64I-NEXT: lbu a1, 22(a5) -; RV64I-NEXT: lbu a3, 21(a5) -; RV64I-NEXT: lbu a5, 20(a5) +; RV64I-NEXT: lbu a7, 13(a6) +; RV64I-NEXT: lbu t0, 14(a6) +; RV64I-NEXT: lbu t1, 15(a6) +; RV64I-NEXT: lbu t2, 0(a6) +; RV64I-NEXT: lbu t3, 1(a6) +; RV64I-NEXT: lbu t4, 2(a6) +; RV64I-NEXT: lbu t5, 3(a6) +; RV64I-NEXT: lbu t6, 4(a6) +; RV64I-NEXT: lbu s0, 5(a6) +; RV64I-NEXT: lbu s1, 6(a6) +; RV64I-NEXT: lbu s2, 7(a6) +; RV64I-NEXT: lbu s3, 24(a6) +; 
RV64I-NEXT: lbu s4, 25(a6) +; RV64I-NEXT: lbu s5, 26(a6) +; RV64I-NEXT: lbu s6, 27(a6) +; RV64I-NEXT: lbu s7, 28(a6) +; RV64I-NEXT: lbu s8, 29(a6) +; RV64I-NEXT: lbu s9, 30(a6) +; RV64I-NEXT: lbu s10, 31(a6) +; RV64I-NEXT: lbu s11, 16(a6) +; RV64I-NEXT: lbu ra, 17(a6) +; RV64I-NEXT: lbu a5, 18(a6) +; RV64I-NEXT: lbu a4, 19(a6) +; RV64I-NEXT: lbu a0, 23(a6) +; RV64I-NEXT: lbu a1, 22(a6) +; RV64I-NEXT: lbu a3, 21(a6) +; RV64I-NEXT: lbu a6, 20(a6) ; RV64I-NEXT: sb a0, 23(a2) ; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a5, 20(a2) +; RV64I-NEXT: sb a6, 20(a2) ; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: sb a5, 18(a2) ; RV64I-NEXT: sb ra, 17(a2) ; RV64I-NEXT: sb s11, 16(a2) ; RV64I-NEXT: sb s10, 31(a2) @@ -2400,8 +2400,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv t1, a1 -; RV32I-NEXT: lbu t0, 31(a0) +; RV32I-NEXT: mv t0, a1 +; RV32I-NEXT: lbu t1, 31(a0) ; RV32I-NEXT: lbu a1, 0(a0) ; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a1, 1(a0) @@ -2439,15 +2439,15 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a1, 30(a0) ; RV32I-NEXT: lbu a3, 29(a0) ; RV32I-NEXT: lbu a0, 28(a0) -; RV32I-NEXT: lbu t1, 0(t1) +; RV32I-NEXT: lbu t0, 0(t0) ; RV32I-NEXT: sb a1, 58(sp) ; RV32I-NEXT: sb a3, 57(sp) ; RV32I-NEXT: sb a0, 56(sp) ; RV32I-NEXT: sb a4, 55(sp) ; RV32I-NEXT: sb a5, 54(sp) ; RV32I-NEXT: sb a6, 53(sp) -; RV32I-NEXT: sb t0, 59(sp) -; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: sb t1, 59(sp) +; RV32I-NEXT: slli t1, t1, 24 ; RV32I-NEXT: sb a7, 52(sp) ; RV32I-NEXT: sb ra, 51(sp) ; RV32I-NEXT: sb s11, 50(sp) @@ -2479,7 +2479,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 29(sp) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload ; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: srai a0, t0, 31 +; RV32I-NEXT: srai a0, t1, 31 ; RV32I-NEXT: sb a0, 88(sp) ; RV32I-NEXT: sb a0, 84(sp) ; RV32I-NEXT: sb a0, 80(sp) @@ -2515,52 +2515,52 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a1, 63(sp) ; RV32I-NEXT: sb a3, 62(sp) ; RV32I-NEXT: sb a0, 61(sp) -; RV32I-NEXT: andi a0, t1, 31 +; RV32I-NEXT: andi a0, t0, 31 ; RV32I-NEXT: addi a1, sp, 28 -; RV32I-NEXT: add a5, a1, a0 -; RV32I-NEXT: lbu a0, 6(a5) +; RV32I-NEXT: add a6, a1, a0 +; RV32I-NEXT: lbu a0, 6(a6) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a5) +; RV32I-NEXT: lbu a0, 7(a6) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a5) +; RV32I-NEXT: lbu a0, 4(a6) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a0, 5(a6) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a5) +; RV32I-NEXT: lbu a0, 0(a6) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a5) -; RV32I-NEXT: lbu t0, 2(a5) -; RV32I-NEXT: lbu t1, 3(a5) -; RV32I-NEXT: lbu t2, 14(a5) -; RV32I-NEXT: lbu t3, 15(a5) -; RV32I-NEXT: lbu t4, 12(a5) -; RV32I-NEXT: lbu t5, 13(a5) -; RV32I-NEXT: lbu t6, 10(a5) -; RV32I-NEXT: lbu s0, 11(a5) -; RV32I-NEXT: lbu s1, 8(a5) -; RV32I-NEXT: lbu s2, 9(a5) -; RV32I-NEXT: lbu s3, 22(a5) -; RV32I-NEXT: lbu s4, 23(a5) -; RV32I-NEXT: lbu s5, 20(a5) -; RV32I-NEXT: lbu s6, 21(a5) -; RV32I-NEXT: lbu s7, 
18(a5) -; RV32I-NEXT: lbu s8, 19(a5) -; RV32I-NEXT: lbu s9, 16(a5) -; RV32I-NEXT: lbu s10, 17(a5) -; RV32I-NEXT: lbu s11, 30(a5) -; RV32I-NEXT: lbu ra, 31(a5) -; RV32I-NEXT: lbu a6, 28(a5) -; RV32I-NEXT: lbu a4, 29(a5) -; RV32I-NEXT: lbu a0, 25(a5) -; RV32I-NEXT: lbu a1, 24(a5) -; RV32I-NEXT: lbu a3, 27(a5) -; RV32I-NEXT: lbu a5, 26(a5) +; RV32I-NEXT: lbu a7, 1(a6) +; RV32I-NEXT: lbu t0, 2(a6) +; RV32I-NEXT: lbu t1, 3(a6) +; RV32I-NEXT: lbu t2, 14(a6) +; RV32I-NEXT: lbu t3, 15(a6) +; RV32I-NEXT: lbu t4, 12(a6) +; RV32I-NEXT: lbu t5, 13(a6) +; RV32I-NEXT: lbu t6, 10(a6) +; RV32I-NEXT: lbu s0, 11(a6) +; RV32I-NEXT: lbu s1, 8(a6) +; RV32I-NEXT: lbu s2, 9(a6) +; RV32I-NEXT: lbu s3, 22(a6) +; RV32I-NEXT: lbu s4, 23(a6) +; RV32I-NEXT: lbu s5, 20(a6) +; RV32I-NEXT: lbu s6, 21(a6) +; RV32I-NEXT: lbu s7, 18(a6) +; RV32I-NEXT: lbu s8, 19(a6) +; RV32I-NEXT: lbu s9, 16(a6) +; RV32I-NEXT: lbu s10, 17(a6) +; RV32I-NEXT: lbu s11, 30(a6) +; RV32I-NEXT: lbu ra, 31(a6) +; RV32I-NEXT: lbu a5, 28(a6) +; RV32I-NEXT: lbu a4, 29(a6) +; RV32I-NEXT: lbu a0, 25(a6) +; RV32I-NEXT: lbu a1, 24(a6) +; RV32I-NEXT: lbu a3, 27(a6) +; RV32I-NEXT: lbu a6, 26(a6) ; RV32I-NEXT: sb a0, 25(a2) ; RV32I-NEXT: sb a1, 24(a2) ; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a5, 26(a2) +; RV32I-NEXT: sb a6, 26(a2) ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a6, 28(a2) +; RV32I-NEXT: sb a5, 28(a2) ; RV32I-NEXT: sb ra, 31(a2) ; RV32I-NEXT: sb s11, 30(a2) ; RV32I-NEXT: sb s10, 17(a2) diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index ace54fa6bf03d..a601256bc2afa 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -781,9 +781,9 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: andi a5, a1, 7 -; RV32I-NEXT: srl a0, a4, a5 +; RV32I-NEXT: or a5, a4, a0 +; RV32I-NEXT: andi a4, a1, 7 +; RV32I-NEXT: srl a0, a5, a4 ; RV32I-NEXT: lbu a1, 9(a3) ; RV32I-NEXT: lbu a6, 8(a3) ; RV32I-NEXT: lbu a7, 10(a3) @@ -795,7 +795,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a6, t0, a7 ; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a5 +; RV32I-NEXT: not a7, a4 ; RV32I-NEXT: sll a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: lbu a7, 1(a3) @@ -808,12 +808,12 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a5 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: xori t0, a5, 31 -; RV32I-NEXT: sll a4, a4, t0 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: srl a6, a6, a5 +; RV32I-NEXT: srl a7, a7, a4 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: xori t0, a4, 31 +; RV32I-NEXT: sll a5, a5, t0 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: srl a6, a6, a4 ; RV32I-NEXT: lbu t1, 13(a3) ; RV32I-NEXT: lbu t2, 12(a3) ; RV32I-NEXT: lbu t3, 14(a3) @@ -827,19 +827,19 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t1, a3, 1 ; RV32I-NEXT: sll t0, t1, t0 ; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: srl a3, a3, a5 +; RV32I-NEXT: srl a3, a3, a4 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb a7, 0(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a5, a6, 16 -; 
RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: srli a5, a6, 8 -; RV32I-NEXT: sb a5, 9(a2) -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: srli a5, a3, 24 -; RV32I-NEXT: sb a5, 15(a2) +; RV32I-NEXT: srli a4, a6, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a4, a6, 8 +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 13(a2) ; RV32I-NEXT: srli a3, a7, 16 @@ -852,8 +852,8 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: srli a0, t0, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: sb a4, 3(a2) +; RV32I-NEXT: srli a5, a5, 24 +; RV32I-NEXT: sb a5, 3(a2) ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: sb a1, 7(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload @@ -1064,9 +1064,9 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: andi a5, a1, 7 -; RV32I-NEXT: sll a0, a4, a5 +; RV32I-NEXT: or a5, a4, a0 +; RV32I-NEXT: andi a4, a1, 7 +; RV32I-NEXT: sll a0, a5, a4 ; RV32I-NEXT: lbu a1, 1(a3) ; RV32I-NEXT: lbu a6, 0(a3) ; RV32I-NEXT: lbu a7, 2(a3) @@ -1078,7 +1078,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a6, t0, a7 ; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: srli a1, a6, 1 -; RV32I-NEXT: xori a7, a5, 31 +; RV32I-NEXT: xori a7, a4, 31 ; RV32I-NEXT: srl a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: lbu t0, 13(a3) @@ -1091,7 +1091,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t3, t3, 24 ; RV32I-NEXT: or t1, t3, t2 ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: sll t0, t0, a5 +; RV32I-NEXT: sll t0, t0, a4 ; RV32I-NEXT: lbu t1, 9(a3) ; RV32I-NEXT: lbu t2, 8(a3) ; RV32I-NEXT: lbu t3, 10(a3) @@ -1105,13 +1105,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli t1, a3, 1 ; RV32I-NEXT: srl a7, t1, a7 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: sll a3, a3, a5 -; RV32I-NEXT: srli a4, a4, 1 -; RV32I-NEXT: not t1, a5 -; RV32I-NEXT: srl a4, a4, t1 -; RV32I-NEXT: or a4, a3, a4 -; RV32I-NEXT: sll a5, a6, a5 -; RV32I-NEXT: sb a5, 0(a2) +; RV32I-NEXT: sll a3, a3, a4 +; RV32I-NEXT: srli a5, a5, 1 +; RV32I-NEXT: not t1, a4 +; RV32I-NEXT: srl a5, a5, t1 +; RV32I-NEXT: or a5, a3, a5 +; RV32I-NEXT: sll a4, a6, a4 +; RV32I-NEXT: sb a4, 0(a2) ; RV32I-NEXT: srli a6, a3, 16 ; RV32I-NEXT: sb a6, 10(a2) ; RV32I-NEXT: srli a6, a3, 24 @@ -1124,19 +1124,19 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a3, 15(a2) ; RV32I-NEXT: srli a3, t0, 8 ; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: srli a3, a4, 16 ; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: srli a3, a5, 24 +; RV32I-NEXT: srli a3, a4, 24 ; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 1(a2) ; RV32I-NEXT: srli a3, a0, 16 ; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: sb a3, 7(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: sb a5, 8(a2) ; RV32I-NEXT: sb a7, 12(a2) ; RV32I-NEXT: sb a1, 4(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload @@ -1353,9 
+1353,9 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: andi a5, a1, 7 -; RV32I-NEXT: srl a0, a4, a5 +; RV32I-NEXT: or a5, a4, a0 +; RV32I-NEXT: andi a4, a1, 7 +; RV32I-NEXT: srl a0, a5, a4 ; RV32I-NEXT: lbu a1, 9(a3) ; RV32I-NEXT: lbu a6, 8(a3) ; RV32I-NEXT: lbu a7, 10(a3) @@ -1367,7 +1367,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a6, t0, a7 ; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a5 +; RV32I-NEXT: not a7, a4 ; RV32I-NEXT: sll a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: lbu a7, 1(a3) @@ -1380,12 +1380,12 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a5 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: xori t0, a5, 31 -; RV32I-NEXT: sll a4, a4, t0 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: srl a6, a6, a5 +; RV32I-NEXT: srl a7, a7, a4 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: xori t0, a4, 31 +; RV32I-NEXT: sll a5, a5, t0 +; RV32I-NEXT: or a5, a7, a5 +; RV32I-NEXT: srl a6, a6, a4 ; RV32I-NEXT: lbu t1, 13(a3) ; RV32I-NEXT: lbu t2, 12(a3) ; RV32I-NEXT: lbu t3, 14(a3) @@ -1399,19 +1399,19 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli t1, a3, 1 ; RV32I-NEXT: sll t0, t1, t0 ; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: sra a3, a3, a5 +; RV32I-NEXT: sra a3, a3, a4 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb a7, 0(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: srli a5, a6, 8 -; RV32I-NEXT: sb a5, 9(a2) -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: srli a5, a3, 24 -; RV32I-NEXT: sb a5, 15(a2) +; RV32I-NEXT: srli a4, a6, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a4, a6, 8 +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 13(a2) ; RV32I-NEXT: srli a3, a7, 16 @@ -1424,8 +1424,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: srli a0, t0, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: sb a4, 3(a2) +; RV32I-NEXT: srli a5, a5, 24 +; RV32I-NEXT: sb a5, 3(a2) ; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: sb a1, 7(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload @@ -1497,13 +1497,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) ; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s9, s11, s9 -; RV64I-NEXT: lbu s11, 4(a1) +; RV64I-NEXT: or s11, s11, s9 +; RV64I-NEXT: lbu s9, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 ; RV64I-NEXT: lbu ra, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s11 -; RV64I-NEXT: lbu s11, 21(a0) +; RV64I-NEXT: or s10, s10, s9 +; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: slli ra, ra, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, ra @@ -1511,8 +1511,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 23(a0) ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, 
s9 -; RV64I-NEXT: lbu s9, 24(a0) +; RV64I-NEXT: or t0, a1, s11 +; RV64I-NEXT: lbu s11, 24(a0) ; RV64I-NEXT: lbu a7, 25(a0) ; RV64I-NEXT: lbu a6, 26(a0) ; RV64I-NEXT: lbu a5, 27(a0) @@ -1527,10 +1527,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 83(sp) ; RV64I-NEXT: sb a6, 82(sp) ; RV64I-NEXT: sb a7, 81(sp) -; RV64I-NEXT: sb s9, 80(sp) +; RV64I-NEXT: sb s11, 80(sp) ; RV64I-NEXT: sb s10, 79(sp) ; RV64I-NEXT: sb ra, 78(sp) -; RV64I-NEXT: sb s11, 77(sp) +; RV64I-NEXT: sb s9, 77(sp) ; RV64I-NEXT: sb s8, 76(sp) ; RV64I-NEXT: sb s7, 75(sp) ; RV64I-NEXT: sb s6, 74(sp) @@ -1688,24 +1688,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: xori t0, a1, 63 ; RV64I-NEXT: sll t1, t1, t0 ; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a3, a7 -; RV64I-NEXT: slli a7, a3, 1 -; RV64I-NEXT: sll a7, a7, t0 -; RV64I-NEXT: srl a4, a4, a1 -; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: or a7, a3, a7 +; RV64I-NEXT: slli a3, a7, 1 +; RV64I-NEXT: sll t0, a3, t0 +; RV64I-NEXT: srl a3, a4, a1 +; RV64I-NEXT: srl a4, a6, a1 ; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: srl a1, a3, a1 -; RV64I-NEXT: srli a3, a5, 48 -; RV64I-NEXT: sb a3, 22(a2) -; RV64I-NEXT: srli a3, a5, 40 -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: srli a3, a5, 32 -; RV64I-NEXT: sb a3, 20(a2) -; RV64I-NEXT: srli a3, a5, 24 -; RV64I-NEXT: sb a3, 19(a2) -; RV64I-NEXT: srli a3, a5, 16 -; RV64I-NEXT: sb a3, 18(a2) -; RV64I-NEXT: or a3, a5, a7 +; RV64I-NEXT: srl a1, a7, a1 +; RV64I-NEXT: srli a6, a5, 48 +; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, a5, 40 +; RV64I-NEXT: sb a6, 21(a2) +; RV64I-NEXT: srli a6, a5, 32 +; RV64I-NEXT: sb a6, 20(a2) +; RV64I-NEXT: srli a6, a5, 24 +; RV64I-NEXT: sb a6, 19(a2) +; RV64I-NEXT: srli a6, a5, 16 +; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: or a6, a5, t0 ; RV64I-NEXT: sb a5, 16(a2) ; RV64I-NEXT: srli a5, a5, 8 ; RV64I-NEXT: sb a5, 17(a2) @@ -1724,35 +1724,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a1, 24(a2) ; RV64I-NEXT: srli a1, a1, 8 ; RV64I-NEXT: sb a1, 25(a2) -; RV64I-NEXT: srli a1, a6, 48 +; RV64I-NEXT: srli a1, a4, 48 ; RV64I-NEXT: sb a1, 6(a2) -; RV64I-NEXT: srli a1, a6, 40 +; RV64I-NEXT: srli a1, a4, 40 ; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a6, 32 +; RV64I-NEXT: srli a1, a4, 32 ; RV64I-NEXT: sb a1, 4(a2) -; RV64I-NEXT: srli a1, a6, 24 +; RV64I-NEXT: srli a1, a4, 24 ; RV64I-NEXT: sb a1, 3(a2) -; RV64I-NEXT: srli a1, a6, 16 +; RV64I-NEXT: srli a1, a4, 16 ; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: or a1, a6, t1 -; RV64I-NEXT: sb a6, 0(a2) -; RV64I-NEXT: srli a5, a6, 8 -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: srli a5, a4, 48 -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: srli a5, a4, 40 -; RV64I-NEXT: sb a5, 13(a2) -; RV64I-NEXT: srli a5, a4, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: sb a5, 11(a2) -; RV64I-NEXT: srli a5, a4, 16 -; RV64I-NEXT: sb a5, 10(a2) -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: sb a4, 8(a2) +; RV64I-NEXT: or a1, a4, t1 +; RV64I-NEXT: sb a4, 0(a2) ; RV64I-NEXT: srli a4, a4, 8 -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: srli a3, a3, 56 +; RV64I-NEXT: sb a4, 1(a2) +; RV64I-NEXT: srli a4, a3, 48 +; RV64I-NEXT: sb a4, 14(a2) +; RV64I-NEXT: srli a4, a3, 40 +; RV64I-NEXT: sb a4, 13(a2) +; RV64I-NEXT: srli a4, a3, 32 +; RV64I-NEXT: sb a4, 12(a2) +; RV64I-NEXT: srli a4, a3, 24 +; RV64I-NEXT: sb a4, 11(a2) +; RV64I-NEXT: srli a4, a3, 16 +; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: or a0, a3, a0 
+; RV64I-NEXT: sb a3, 8(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 9(a2) +; RV64I-NEXT: srli a3, a6, 56 ; RV64I-NEXT: sb a3, 23(a2) ; RV64I-NEXT: srli a1, a1, 56 ; RV64I-NEXT: sb a1, 7(a2) @@ -1816,21 +1816,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s5, 17(a0) ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s8, 1(a1) -; RV32I-NEXT: lbu s9, 20(a0) -; RV32I-NEXT: lbu s10, 21(a0) +; RV32I-NEXT: lbu s10, 1(a1) +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) ; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli s10, s10, 8 ; RV32I-NEXT: lbu ra, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s8, s8, s11 +; RV32I-NEXT: or s10, s10, s11 ; RV32I-NEXT: lbu s11, 22(a0) ; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, ra ; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s8 -; RV32I-NEXT: lbu s8, 24(a0) +; RV32I-NEXT: or t0, a1, s10 +; RV32I-NEXT: lbu s10, 24(a0) ; RV32I-NEXT: lbu a7, 25(a0) ; RV32I-NEXT: lbu a6, 26(a0) ; RV32I-NEXT: lbu a5, 27(a0) @@ -1845,11 +1845,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a5, 55(sp) ; RV32I-NEXT: sb a6, 54(sp) ; RV32I-NEXT: sb a7, 53(sp) -; RV32I-NEXT: sb s8, 52(sp) +; RV32I-NEXT: sb s10, 52(sp) ; RV32I-NEXT: sb ra, 51(sp) ; RV32I-NEXT: sb s11, 50(sp) -; RV32I-NEXT: sb s10, 49(sp) -; RV32I-NEXT: sb s9, 48(sp) +; RV32I-NEXT: sb s9, 49(sp) +; RV32I-NEXT: sb s8, 48(sp) ; RV32I-NEXT: sb s7, 47(sp) ; RV32I-NEXT: sb s6, 46(sp) ; RV32I-NEXT: sb s5, 45(sp) @@ -1921,7 +1921,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a5, a5, 24 ; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t4, a3, a0 +; RV32I-NEXT: or t5, a3, a0 ; RV32I-NEXT: andi a3, t0, 7 ; RV32I-NEXT: lbu a0, 9(a4) ; RV32I-NEXT: lbu a1, 8(a4) @@ -1934,69 +1934,69 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a1, a6, a5 ; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t0, a3 -; RV32I-NEXT: sll a0, a0, t0 +; RV32I-NEXT: not t1, a3 +; RV32I-NEXT: sll a0, a0, t1 ; RV32I-NEXT: lbu a1, 1(a4) ; RV32I-NEXT: lbu a5, 0(a4) ; RV32I-NEXT: lbu a7, 2(a4) -; RV32I-NEXT: lbu t1, 3(a4) +; RV32I-NEXT: lbu t0, 3(a4) ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t1, a7 -; RV32I-NEXT: or t1, a5, a1 -; RV32I-NEXT: slli a1, t4, 1 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or t0, a5, a1 +; RV32I-NEXT: slli a1, t5, 1 ; RV32I-NEXT: xori t2, a3, 31 ; RV32I-NEXT: sll a1, a1, t2 ; RV32I-NEXT: lbu a5, 13(a4) ; RV32I-NEXT: lbu a7, 12(a4) ; RV32I-NEXT: lbu t3, 14(a4) -; RV32I-NEXT: lbu t5, 15(a4) +; RV32I-NEXT: lbu t4, 15(a4) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a7, t5, t3 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t3, a7, a5 ; RV32I-NEXT: lbu a5, 17(a4) ; RV32I-NEXT: lbu a7, 16(a4) -; RV32I-NEXT: lbu t5, 18(a4) +; RV32I-NEXT: lbu t4, 18(a4) ; RV32I-NEXT: lbu t6, 19(a4) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t5 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: slli a7, 
a5, 1 -; RV32I-NEXT: sll a7, a7, t0 -; RV32I-NEXT: lbu t5, 21(a4) +; RV32I-NEXT: or a7, t6, t4 +; RV32I-NEXT: or t4, a7, a5 +; RV32I-NEXT: slli a5, t4, 1 +; RV32I-NEXT: sll a7, a5, t1 +; RV32I-NEXT: lbu a5, 21(a4) ; RV32I-NEXT: lbu t6, 20(a4) ; RV32I-NEXT: lbu s0, 22(a4) ; RV32I-NEXT: lbu s1, 23(a4) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t5, t5, t6 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: lbu t6, 25(a4) -; RV32I-NEXT: lbu s0, 24(a4) +; RV32I-NEXT: or s0, s0, a5 +; RV32I-NEXT: lbu a5, 25(a4) +; RV32I-NEXT: lbu t6, 24(a4) ; RV32I-NEXT: lbu s1, 26(a4) ; RV32I-NEXT: lbu s2, 27(a4) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t6, t6, s0 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t6 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: lbu s0, 29(a4) +; RV32I-NEXT: or t6, s2, s1 +; RV32I-NEXT: or t6, t6, a5 +; RV32I-NEXT: lbu a5, 29(a4) ; RV32I-NEXT: lbu s1, 28(a4) ; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t0, s2, t0 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: or s0, s0, s1 +; RV32I-NEXT: sll t1, s2, t1 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, s1 ; RV32I-NEXT: lbu s1, 30(a4) ; RV32I-NEXT: lbu a4, 31(a4) ; RV32I-NEXT: slli s2, t3, 1 @@ -2004,76 +2004,76 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli a4, a4, 24 ; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: slli s1, t5, 1 +; RV32I-NEXT: slli s1, s0, 1 ; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or a4, a4, s0 -; RV32I-NEXT: slli s0, a4, 1 -; RV32I-NEXT: sll t2, s0, t2 -; RV32I-NEXT: srl t4, t4, a3 -; RV32I-NEXT: srl t1, t1, a3 -; RV32I-NEXT: srl t3, t3, a3 +; RV32I-NEXT: or s3, a4, a5 +; RV32I-NEXT: slli a4, s3, 1 +; RV32I-NEXT: sll t2, a4, t2 +; RV32I-NEXT: srl a4, t5, a3 +; RV32I-NEXT: srl a5, t0, a3 +; RV32I-NEXT: srl t0, t3, a3 ; RV32I-NEXT: srl a6, a6, a3 -; RV32I-NEXT: srl t5, t5, a3 -; RV32I-NEXT: srl a5, a5, a3 -; RV32I-NEXT: srl t6, t6, a3 -; RV32I-NEXT: srl a3, a4, a3 -; RV32I-NEXT: srli a4, t6, 16 -; RV32I-NEXT: sb a4, 26(a2) -; RV32I-NEXT: or a4, t6, t2 -; RV32I-NEXT: sb t6, 24(a2) -; RV32I-NEXT: srli t2, t6, 8 -; RV32I-NEXT: sb t2, 25(a2) -; RV32I-NEXT: srli t2, a3, 24 -; RV32I-NEXT: sb t2, 31(a2) -; RV32I-NEXT: srli t2, a3, 16 -; RV32I-NEXT: sb t2, 30(a2) +; RV32I-NEXT: srl t3, s0, a3 +; RV32I-NEXT: srl t4, t4, a3 +; RV32I-NEXT: srl t5, t6, a3 +; RV32I-NEXT: srl a3, s3, a3 +; RV32I-NEXT: srli t6, t5, 16 +; RV32I-NEXT: sb t6, 26(a2) +; RV32I-NEXT: or t2, t5, t2 +; RV32I-NEXT: sb t5, 24(a2) +; RV32I-NEXT: srli t5, t5, 8 +; RV32I-NEXT: sb t5, 25(a2) +; RV32I-NEXT: srli t5, a3, 24 +; RV32I-NEXT: sb t5, 31(a2) +; RV32I-NEXT: srli t5, a3, 16 +; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb a3, 28(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 29(a2) -; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: srli a3, t4, 16 ; RV32I-NEXT: sb a3, 18(a2) -; RV32I-NEXT: or s1, a5, s1 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 17(a2) -; RV32I-NEXT: srli a3, t5, 16 -; RV32I-NEXT: sb a3, 22(a2) -; RV32I-NEXT: or a3, t5, t0 -; RV32I-NEXT: sb t5, 20(a2) -; RV32I-NEXT: srli a5, t5, 8 -; RV32I-NEXT: sb a5, 21(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: or a5, a6, s2 +; RV32I-NEXT: or a3, t4, s1 +; RV32I-NEXT: sb t4, 16(a2) +; RV32I-NEXT: 
srli t4, t4, 8 +; RV32I-NEXT: sb t4, 17(a2) +; RV32I-NEXT: srli t4, t3, 16 +; RV32I-NEXT: sb t4, 22(a2) +; RV32I-NEXT: or t1, t3, t1 +; RV32I-NEXT: sb t3, 20(a2) +; RV32I-NEXT: srli t3, t3, 8 +; RV32I-NEXT: sb t3, 21(a2) +; RV32I-NEXT: srli t3, a6, 16 +; RV32I-NEXT: sb t3, 10(a2) +; RV32I-NEXT: or t3, a6, s2 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: srli a6, a6, 8 ; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t3, 16 +; RV32I-NEXT: srli a6, t0, 16 ; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t3, a7 -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: srli a7, t3, 8 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: sb t0, 12(a2) +; RV32I-NEXT: srli a7, t0, 8 ; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, t1, 16 +; RV32I-NEXT: srli a7, a5, 16 ; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, t1, a1 -; RV32I-NEXT: sb t1, 0(a2) -; RV32I-NEXT: srli a7, t1, 8 -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: srli a7, t4, 16 -; RV32I-NEXT: sb a7, 6(a2) -; RV32I-NEXT: or a0, t4, a0 -; RV32I-NEXT: sb t4, 4(a2) -; RV32I-NEXT: srli a7, t4, 8 -; RV32I-NEXT: sb a7, 5(a2) -; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: sb a5, 0(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: srli a4, t2, 24 ; RV32I-NEXT: sb a4, 27(a2) -; RV32I-NEXT: srli s1, s1, 24 -; RV32I-NEXT: sb s1, 19(a2) ; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: sb a3, 19(a2) +; RV32I-NEXT: srli a3, t1, 24 ; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a3, t3, 24 +; RV32I-NEXT: sb a3, 11(a2) ; RV32I-NEXT: srli a3, a6, 24 ; RV32I-NEXT: sb a3, 15(a2) ; RV32I-NEXT: srli a1, a1, 24 @@ -2155,13 +2155,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) ; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s9, s11, s9 -; RV64I-NEXT: lbu s11, 4(a1) +; RV64I-NEXT: or s11, s11, s9 +; RV64I-NEXT: lbu s9, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 ; RV64I-NEXT: lbu ra, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s11 -; RV64I-NEXT: lbu s11, 21(a0) +; RV64I-NEXT: or s10, s10, s9 +; RV64I-NEXT: lbu s9, 21(a0) ; RV64I-NEXT: slli ra, ra, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, ra @@ -2169,8 +2169,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 23(a0) ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, s9 -; RV64I-NEXT: lbu s9, 24(a0) +; RV64I-NEXT: or t0, a1, s11 +; RV64I-NEXT: lbu s11, 24(a0) ; RV64I-NEXT: lbu a7, 25(a0) ; RV64I-NEXT: lbu a6, 26(a0) ; RV64I-NEXT: lbu a5, 27(a0) @@ -2185,10 +2185,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 115(sp) ; RV64I-NEXT: sb a6, 114(sp) ; RV64I-NEXT: sb a7, 113(sp) -; RV64I-NEXT: sb s9, 112(sp) +; RV64I-NEXT: sb s11, 112(sp) ; RV64I-NEXT: sb s10, 111(sp) ; RV64I-NEXT: sb ra, 110(sp) -; RV64I-NEXT: sb s11, 109(sp) +; RV64I-NEXT: sb s9, 109(sp) ; RV64I-NEXT: sb s8, 108(sp) ; RV64I-NEXT: sb s7, 107(sp) ; RV64I-NEXT: sb s6, 106(sp) @@ -2474,21 +2474,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s5, 17(a0) ; RV32I-NEXT: lbu s6, 18(a0) ; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s8, 1(a1) -; RV32I-NEXT: lbu s9, 20(a0) -; RV32I-NEXT: 
lbu s10, 21(a0) +; RV32I-NEXT: lbu s10, 1(a1) +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) ; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli s10, s10, 8 ; RV32I-NEXT: lbu ra, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s8, s8, s11 +; RV32I-NEXT: or s10, s10, s11 ; RV32I-NEXT: lbu s11, 22(a0) ; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, ra ; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s8 -; RV32I-NEXT: lbu s8, 24(a0) +; RV32I-NEXT: or t0, a1, s10 +; RV32I-NEXT: lbu s10, 24(a0) ; RV32I-NEXT: lbu a7, 25(a0) ; RV32I-NEXT: lbu a6, 26(a0) ; RV32I-NEXT: lbu a5, 27(a0) @@ -2503,11 +2503,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a5, 87(sp) ; RV32I-NEXT: sb a6, 86(sp) ; RV32I-NEXT: sb a7, 85(sp) -; RV32I-NEXT: sb s8, 84(sp) +; RV32I-NEXT: sb s10, 84(sp) ; RV32I-NEXT: sb ra, 83(sp) ; RV32I-NEXT: sb s11, 82(sp) -; RV32I-NEXT: sb s10, 81(sp) -; RV32I-NEXT: sb s9, 80(sp) +; RV32I-NEXT: sb s9, 81(sp) +; RV32I-NEXT: sb s8, 80(sp) ; RV32I-NEXT: sb s7, 79(sp) ; RV32I-NEXT: sb s6, 78(sp) ; RV32I-NEXT: sb s5, 77(sp) @@ -2568,125 +2568,125 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 60(sp) ; RV32I-NEXT: slli a0, t0, 24 ; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a5, sp, 60 -; RV32I-NEXT: sub a5, a5, a0 -; RV32I-NEXT: lbu a0, 5(a5) -; RV32I-NEXT: lbu a1, 4(a5) -; RV32I-NEXT: lbu a3, 6(a5) -; RV32I-NEXT: lbu a4, 7(a5) +; RV32I-NEXT: addi a4, sp, 60 +; RV32I-NEXT: sub a4, a4, a0 +; RV32I-NEXT: lbu a0, 5(a4) +; RV32I-NEXT: lbu a1, 4(a4) +; RV32I-NEXT: lbu a3, 6(a4) +; RV32I-NEXT: lbu a5, 7(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or t3, a3, a0 +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or t5, a3, a0 ; RV32I-NEXT: andi a1, t0, 7 -; RV32I-NEXT: lbu a0, 1(a5) -; RV32I-NEXT: lbu a3, 0(a5) -; RV32I-NEXT: lbu a4, 2(a5) -; RV32I-NEXT: lbu a6, 3(a5) +; RV32I-NEXT: lbu a0, 1(a4) +; RV32I-NEXT: lbu a3, 0(a4) +; RV32I-NEXT: lbu a5, 2(a4) +; RV32I-NEXT: lbu a6, 3(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a6, a4 +; RV32I-NEXT: or a3, a6, a5 ; RV32I-NEXT: or a6, a3, a0 ; RV32I-NEXT: srli a0, a6, 1 ; RV32I-NEXT: xori a7, a1, 31 ; RV32I-NEXT: srl a0, a0, a7 -; RV32I-NEXT: lbu a3, 13(a5) -; RV32I-NEXT: lbu a4, 12(a5) -; RV32I-NEXT: lbu t0, 14(a5) -; RV32I-NEXT: lbu t1, 15(a5) +; RV32I-NEXT: lbu a3, 13(a4) +; RV32I-NEXT: lbu a5, 12(a4) +; RV32I-NEXT: lbu t0, 14(a4) +; RV32I-NEXT: lbu t1, 15(a4) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a4, t1, t0 -; RV32I-NEXT: or t0, a4, a3 -; RV32I-NEXT: lbu a3, 9(a5) -; RV32I-NEXT: lbu a4, 8(a5) -; RV32I-NEXT: lbu t1, 10(a5) -; RV32I-NEXT: lbu t2, 11(a5) +; RV32I-NEXT: or a5, t1, t0 +; RV32I-NEXT: or t0, a5, a3 +; RV32I-NEXT: lbu a3, 9(a4) +; RV32I-NEXT: lbu a5, 8(a4) +; RV32I-NEXT: lbu t1, 10(a4) +; RV32I-NEXT: lbu t2, 11(a4) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a4, t2, t1 -; RV32I-NEXT: or t1, a4, a3 +; RV32I-NEXT: or a5, t2, t1 +; RV32I-NEXT: 
or t1, a5, a3 ; RV32I-NEXT: srli a3, t1, 1 -; RV32I-NEXT: srl a3, a3, a7 -; RV32I-NEXT: srli a4, t3, 1 +; RV32I-NEXT: srl a5, a3, a7 +; RV32I-NEXT: srli t4, t5, 1 ; RV32I-NEXT: not t2, a1 -; RV32I-NEXT: lbu t4, 21(a5) -; RV32I-NEXT: lbu t5, 20(a5) -; RV32I-NEXT: lbu t6, 22(a5) -; RV32I-NEXT: lbu s0, 23(a5) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t4, t4, t5 +; RV32I-NEXT: lbu a3, 21(a4) +; RV32I-NEXT: lbu t3, 20(a4) +; RV32I-NEXT: lbu t6, 22(a4) +; RV32I-NEXT: lbu s0, 23(a4) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, t3 ; RV32I-NEXT: slli t6, t6, 16 ; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: lbu t5, 17(a5) -; RV32I-NEXT: lbu t6, 16(a5) -; RV32I-NEXT: lbu s0, 18(a5) -; RV32I-NEXT: lbu s1, 19(a5) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t5, t5, t6 +; RV32I-NEXT: or t3, s0, t6 +; RV32I-NEXT: or t3, t3, a3 +; RV32I-NEXT: lbu a3, 17(a4) +; RV32I-NEXT: lbu t6, 16(a4) +; RV32I-NEXT: lbu s0, 18(a4) +; RV32I-NEXT: lbu s1, 19(a4) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: lbu t6, 29(a5) -; RV32I-NEXT: lbu s0, 28(a5) -; RV32I-NEXT: lbu s1, 30(a5) -; RV32I-NEXT: lbu s2, 31(a5) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t6, t6, s0 +; RV32I-NEXT: or s0, s0, a3 +; RV32I-NEXT: lbu a3, 29(a4) +; RV32I-NEXT: lbu t6, 28(a4) +; RV32I-NEXT: lbu s1, 30(a4) +; RV32I-NEXT: lbu s2, 31(a4) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, t6 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: lbu s1, 25(a5) -; RV32I-NEXT: lbu s2, 24(a5) -; RV32I-NEXT: srl a4, a4, t2 -; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: or t6, s2, s1 +; RV32I-NEXT: lbu s1, 25(a4) +; RV32I-NEXT: lbu s2, 24(a4) +; RV32I-NEXT: srl t4, t4, t2 +; RV32I-NEXT: or t6, t6, a3 ; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or s0, s1, s2 -; RV32I-NEXT: lbu s1, 26(a5) -; RV32I-NEXT: lbu a5, 27(a5) -; RV32I-NEXT: srli s2, t5, 1 +; RV32I-NEXT: or a3, s1, s2 +; RV32I-NEXT: lbu s1, 26(a4) +; RV32I-NEXT: lbu a4, 27(a4) +; RV32I-NEXT: srli s2, s0, 1 ; RV32I-NEXT: srl s2, s2, a7 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a5, a5, s1 +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a4, a4, s1 ; RV32I-NEXT: srli s1, t0, 1 ; RV32I-NEXT: srl s1, s1, t2 -; RV32I-NEXT: or a5, a5, s0 -; RV32I-NEXT: srli s0, a5, 1 -; RV32I-NEXT: srl a7, s0, a7 -; RV32I-NEXT: srli s0, t4, 1 -; RV32I-NEXT: srl t2, s0, t2 -; RV32I-NEXT: sll t3, t3, a1 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: srli a3, a4, 1 +; RV32I-NEXT: srl a7, a3, a7 +; RV32I-NEXT: srli a3, t3, 1 +; RV32I-NEXT: srl t2, a3, t2 +; RV32I-NEXT: sll a3, t5, a1 ; RV32I-NEXT: sll t0, t0, a1 ; RV32I-NEXT: sll t1, t1, a1 -; RV32I-NEXT: sll t4, t4, a1 -; RV32I-NEXT: sll t5, t5, a1 +; RV32I-NEXT: sll t3, t3, a1 +; RV32I-NEXT: sll t5, s0, a1 ; RV32I-NEXT: sll t6, t6, a1 -; RV32I-NEXT: sll a5, a5, a1 +; RV32I-NEXT: sll a4, a4, a1 ; RV32I-NEXT: sll a1, a6, a1 -; RV32I-NEXT: srli a6, a5, 24 +; RV32I-NEXT: srli a6, a4, 24 ; RV32I-NEXT: sb a6, 27(a2) -; RV32I-NEXT: srli a6, a5, 16 +; RV32I-NEXT: srli a6, a4, 16 ; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: or a6, a5, t2 -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 25(a2) -; RV32I-NEXT: srli a5, t6, 24 -; RV32I-NEXT: sb a5, 31(a2) -; RV32I-NEXT: srli a5, t6, 16 -; RV32I-NEXT: sb a5, 30(a2) -; RV32I-NEXT: or a5, t6, a7 +; RV32I-NEXT: or a6, a4, t2 +; 
RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 25(a2) +; RV32I-NEXT: srli a4, t6, 24 +; RV32I-NEXT: sb a4, 31(a2) +; RV32I-NEXT: srli a4, t6, 16 +; RV32I-NEXT: sb a4, 30(a2) +; RV32I-NEXT: or a4, t6, a7 ; RV32I-NEXT: srli a7, t6, 8 ; RV32I-NEXT: sb a7, 29(a2) ; RV32I-NEXT: srli a7, t5, 24 @@ -2696,25 +2696,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a7, t5, s1 ; RV32I-NEXT: srli t2, t5, 8 ; RV32I-NEXT: sb t2, 17(a2) -; RV32I-NEXT: srli t2, t4, 24 +; RV32I-NEXT: srli t2, t3, 24 ; RV32I-NEXT: sb t2, 23(a2) -; RV32I-NEXT: srli t2, t4, 16 +; RV32I-NEXT: srli t2, t3, 16 ; RV32I-NEXT: sb t2, 22(a2) -; RV32I-NEXT: or t2, t4, s2 -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t4, 21(a2) -; RV32I-NEXT: srli t4, t1, 24 -; RV32I-NEXT: sb t4, 11(a2) -; RV32I-NEXT: srli t4, t1, 16 -; RV32I-NEXT: sb t4, 10(a2) -; RV32I-NEXT: or a4, t1, a4 +; RV32I-NEXT: or t2, t3, s2 +; RV32I-NEXT: srli t3, t3, 8 +; RV32I-NEXT: sb t3, 21(a2) +; RV32I-NEXT: srli t3, t1, 24 +; RV32I-NEXT: sb t3, 11(a2) +; RV32I-NEXT: srli t3, t1, 16 +; RV32I-NEXT: sb t3, 10(a2) +; RV32I-NEXT: or t3, t1, t4 ; RV32I-NEXT: srli t1, t1, 8 ; RV32I-NEXT: sb t1, 9(a2) ; RV32I-NEXT: srli t1, t0, 24 ; RV32I-NEXT: sb t1, 15(a2) ; RV32I-NEXT: srli t1, t0, 16 ; RV32I-NEXT: sb t1, 14(a2) -; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: or a5, t0, a5 ; RV32I-NEXT: srli t0, t0, 8 ; RV32I-NEXT: sb t0, 13(a2) ; RV32I-NEXT: srli t0, a1, 24 @@ -2724,19 +2724,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: srli a1, a1, 8 ; RV32I-NEXT: sb a1, 1(a2) -; RV32I-NEXT: srli a1, t3, 24 +; RV32I-NEXT: srli a1, a3, 24 ; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: srli a1, t3, 16 +; RV32I-NEXT: srli a1, a3, 16 ; RV32I-NEXT: sb a1, 6(a2) -; RV32I-NEXT: or a0, t3, a0 -; RV32I-NEXT: srli a1, t3, 8 -; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 5(a2) ; RV32I-NEXT: sb a6, 24(a2) -; RV32I-NEXT: sb a5, 28(a2) +; RV32I-NEXT: sb a4, 28(a2) ; RV32I-NEXT: sb a7, 16(a2) ; RV32I-NEXT: sb t2, 20(a2) -; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb t3, 8(a2) +; RV32I-NEXT: sb a5, 12(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload @@ -2776,7 +2776,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t0, 31(a0) +; RV64I-NEXT: lbu t1, 31(a0) ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 1(a0) @@ -2789,31 +2789,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 5(a0) ; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t2, 6(a0) -; RV64I-NEXT: lbu t3, 7(a0) -; RV64I-NEXT: lbu t4, 8(a0) -; RV64I-NEXT: lbu t5, 9(a0) -; RV64I-NEXT: lbu t6, 10(a0) -; RV64I-NEXT: lbu s0, 11(a0) -; RV64I-NEXT: lbu s1, 12(a0) -; RV64I-NEXT: lbu s2, 13(a0) -; RV64I-NEXT: lbu s3, 14(a0) -; RV64I-NEXT: lbu s4, 15(a0) -; RV64I-NEXT: lbu s5, 16(a0) -; RV64I-NEXT: lbu s6, 17(a0) -; RV64I-NEXT: lbu s7, 18(a0) -; RV64I-NEXT: lbu s8, 19(a0) -; RV64I-NEXT: lbu s9, 1(a1) +; RV64I-NEXT: lbu t3, 6(a0) +; RV64I-NEXT: lbu t4, 7(a0) +; RV64I-NEXT: lbu t5, 8(a0) 
+; RV64I-NEXT: lbu t6, 9(a0) +; RV64I-NEXT: lbu s0, 10(a0) +; RV64I-NEXT: lbu s1, 11(a0) +; RV64I-NEXT: lbu s2, 12(a0) +; RV64I-NEXT: lbu s3, 13(a0) +; RV64I-NEXT: lbu s4, 14(a0) +; RV64I-NEXT: lbu s5, 15(a0) +; RV64I-NEXT: lbu s6, 16(a0) +; RV64I-NEXT: lbu s7, 17(a0) +; RV64I-NEXT: lbu s8, 18(a0) +; RV64I-NEXT: lbu s9, 19(a0) +; RV64I-NEXT: lbu a3, 1(a1) ; RV64I-NEXT: lbu s10, 0(a1) ; RV64I-NEXT: lbu s11, 2(a1) ; RV64I-NEXT: lbu ra, 3(a1) -; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s9, s9, s10 +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, s10 ; RV64I-NEXT: slli s11, s11, 16 ; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) ; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s9, s11, s9 +; RV64I-NEXT: or a3, s11, a3 ; RV64I-NEXT: lbu s11, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 ; RV64I-NEXT: lbu ra, 6(a1) @@ -2827,8 +2827,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 22(a0) ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t1, a1, s9 -; RV64I-NEXT: lbu s9, 23(a0) +; RV64I-NEXT: or t2, a1, a3 +; RV64I-NEXT: lbu t0, 23(a0) ; RV64I-NEXT: lbu a7, 24(a0) ; RV64I-NEXT: lbu a6, 25(a0) ; RV64I-NEXT: lbu a5, 26(a0) @@ -2843,26 +2843,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a5, 82(sp) ; RV64I-NEXT: sb a6, 81(sp) ; RV64I-NEXT: sb a7, 80(sp) -; RV64I-NEXT: sb s9, 79(sp) +; RV64I-NEXT: sb t0, 79(sp) ; RV64I-NEXT: sb s10, 78(sp) ; RV64I-NEXT: sb ra, 77(sp) ; RV64I-NEXT: sb s11, 76(sp) -; RV64I-NEXT: sb s8, 75(sp) -; RV64I-NEXT: sb s7, 74(sp) -; RV64I-NEXT: sb s6, 73(sp) -; RV64I-NEXT: sb s5, 72(sp) -; RV64I-NEXT: sb s4, 71(sp) -; RV64I-NEXT: sb s3, 70(sp) -; RV64I-NEXT: sb s2, 69(sp) -; RV64I-NEXT: sb s1, 68(sp) -; RV64I-NEXT: sb s0, 67(sp) -; RV64I-NEXT: sb t6, 66(sp) -; RV64I-NEXT: sb t5, 65(sp) -; RV64I-NEXT: sb t4, 64(sp) -; RV64I-NEXT: sb t0, 87(sp) -; RV64I-NEXT: slli t0, t0, 56 -; RV64I-NEXT: sb t3, 63(sp) -; RV64I-NEXT: sb t2, 62(sp) +; RV64I-NEXT: sb s9, 75(sp) +; RV64I-NEXT: sb s8, 74(sp) +; RV64I-NEXT: sb s7, 73(sp) +; RV64I-NEXT: sb s6, 72(sp) +; RV64I-NEXT: sb s5, 71(sp) +; RV64I-NEXT: sb s4, 70(sp) +; RV64I-NEXT: sb s3, 69(sp) +; RV64I-NEXT: sb s2, 68(sp) +; RV64I-NEXT: sb s1, 67(sp) +; RV64I-NEXT: sb s0, 66(sp) +; RV64I-NEXT: sb t6, 65(sp) +; RV64I-NEXT: sb t5, 64(sp) +; RV64I-NEXT: sb t1, 87(sp) +; RV64I-NEXT: slli t1, t1, 56 +; RV64I-NEXT: sb t4, 63(sp) +; RV64I-NEXT: sb t3, 62(sp) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 61(sp) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload @@ -2875,7 +2875,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a0, 57(sp) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t0, 63 +; RV64I-NEXT: srai a0, t1, 63 ; RV64I-NEXT: sb a0, 112(sp) ; RV64I-NEXT: sb a0, 104(sp) ; RV64I-NEXT: sb a0, 96(sp) @@ -2915,47 +2915,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a6, 91(sp) ; RV64I-NEXT: sb a7, 90(sp) ; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: slli a0, t1, 56 +; RV64I-NEXT: slli a0, t2, 56 ; RV64I-NEXT: srli a0, a0, 59 -; RV64I-NEXT: addi a3, sp, 56 -; RV64I-NEXT: add a3, a3, a0 -; RV64I-NEXT: lbu a0, 9(a3) -; RV64I-NEXT: lbu a1, 8(a3) -; RV64I-NEXT: lbu a4, 10(a3) -; RV64I-NEXT: lbu a5, 11(a3) +; RV64I-NEXT: addi a1, sp, 56 +; RV64I-NEXT: add a1, a1, a0 +; RV64I-NEXT: lbu a0, 9(a1) +; RV64I-NEXT: lbu a3, 8(a1) +; 
RV64I-NEXT: lbu a4, 10(a1) +; RV64I-NEXT: lbu a5, 11(a1) ; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a5, a5, 24 ; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: lbu a1, 13(a3) -; RV64I-NEXT: lbu a4, 12(a3) -; RV64I-NEXT: lbu a5, 14(a3) -; RV64I-NEXT: lbu a6, 15(a3) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: lbu a3, 13(a1) +; RV64I-NEXT: lbu a4, 12(a1) +; RV64I-NEXT: lbu a5, 14(a1) +; RV64I-NEXT: lbu a6, 15(a1) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a4, a1, a0 -; RV64I-NEXT: andi a1, t1, 7 -; RV64I-NEXT: lbu a0, 17(a3) -; RV64I-NEXT: lbu a5, 16(a3) -; RV64I-NEXT: lbu a6, 18(a3) -; RV64I-NEXT: lbu a7, 19(a3) +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: or a4, a3, a0 +; RV64I-NEXT: andi a3, t2, 7 +; RV64I-NEXT: lbu a0, 17(a1) +; RV64I-NEXT: lbu a5, 16(a1) +; RV64I-NEXT: lbu a6, 18(a1) +; RV64I-NEXT: lbu a7, 19(a1) ; RV64I-NEXT: slli a0, a0, 8 ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 ; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: lbu a5, 21(a3) -; RV64I-NEXT: lbu a6, 20(a3) -; RV64I-NEXT: lbu a7, 22(a3) -; RV64I-NEXT: lbu t0, 23(a3) +; RV64I-NEXT: lbu a5, 21(a1) +; RV64I-NEXT: lbu a6, 20(a1) +; RV64I-NEXT: lbu a7, 22(a1) +; RV64I-NEXT: lbu t0, 23(a1) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 @@ -2965,22 +2965,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli a5, a5, 32 ; RV64I-NEXT: or a5, a5, a0 ; RV64I-NEXT: slli a0, a5, 1 -; RV64I-NEXT: not a6, a1 +; RV64I-NEXT: not a6, a3 ; RV64I-NEXT: sll a0, a0, a6 -; RV64I-NEXT: lbu a6, 1(a3) -; RV64I-NEXT: lbu a7, 0(a3) -; RV64I-NEXT: lbu t0, 2(a3) -; RV64I-NEXT: lbu t1, 3(a3) +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t1, 3(a1) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a3) -; RV64I-NEXT: lbu t0, 4(a3) -; RV64I-NEXT: lbu t1, 6(a3) -; RV64I-NEXT: lbu t2, 7(a3) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu t2, 7(a1) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 @@ -2989,98 +2989,98 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a7, t0, a7 ; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 25(a3) -; RV64I-NEXT: lbu t0, 24(a3) -; RV64I-NEXT: lbu t1, 26(a3) -; RV64I-NEXT: lbu t2, 27(a3) +; RV64I-NEXT: lbu a7, 25(a1) +; RV64I-NEXT: lbu t0, 24(a1) +; RV64I-NEXT: lbu t1, 26(a1) +; RV64I-NEXT: lbu t2, 27(a1) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 29(a3) -; RV64I-NEXT: lbu t1, 28(a3) -; RV64I-NEXT: lbu t2, 30(a3) -; RV64I-NEXT: lbu a3, 31(a3) +; RV64I-NEXT: lbu t0, 29(a1) +; RV64I-NEXT: lbu t1, 28(a1) +; RV64I-NEXT: lbu t2, 30(a1) +; RV64I-NEXT: lbu a1, 31(a1) ; RV64I-NEXT: slli t0, t0, 8 ; 
RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli a3, a3, 24 -; RV64I-NEXT: or a3, a3, t2 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: or a3, a3, t0 -; RV64I-NEXT: xori t0, a1, 63 +; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: xori t0, a3, 63 ; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a3, a7 -; RV64I-NEXT: slli a7, a3, 1 -; RV64I-NEXT: sll a7, a7, t0 -; RV64I-NEXT: srl a4, a4, a1 -; RV64I-NEXT: srl a6, a6, a1 -; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: sra a1, a3, a1 -; RV64I-NEXT: srli a3, a5, 48 -; RV64I-NEXT: sb a3, 22(a2) -; RV64I-NEXT: srli a3, a5, 40 -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: srli a3, a5, 32 -; RV64I-NEXT: sb a3, 20(a2) -; RV64I-NEXT: srli a3, a5, 24 -; RV64I-NEXT: sb a3, 19(a2) -; RV64I-NEXT: srli a3, a5, 16 -; RV64I-NEXT: sb a3, 18(a2) -; RV64I-NEXT: or a3, a5, a7 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a7, a1, a7 +; RV64I-NEXT: slli a1, a7, 1 +; RV64I-NEXT: sll t0, a1, t0 +; RV64I-NEXT: srl a1, a4, a3 +; RV64I-NEXT: srl a4, a6, a3 +; RV64I-NEXT: srl a5, a5, a3 +; RV64I-NEXT: sra a3, a7, a3 +; RV64I-NEXT: srli a6, a5, 48 +; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, a5, 40 +; RV64I-NEXT: sb a6, 21(a2) +; RV64I-NEXT: srli a6, a5, 32 +; RV64I-NEXT: sb a6, 20(a2) +; RV64I-NEXT: srli a6, a5, 24 +; RV64I-NEXT: sb a6, 19(a2) +; RV64I-NEXT: srli a6, a5, 16 +; RV64I-NEXT: sb a6, 18(a2) +; RV64I-NEXT: or a6, a5, t0 ; RV64I-NEXT: sb a5, 16(a2) ; RV64I-NEXT: srli a5, a5, 8 ; RV64I-NEXT: sb a5, 17(a2) -; RV64I-NEXT: srli a5, a1, 56 +; RV64I-NEXT: srli a5, a3, 56 ; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a1, 48 +; RV64I-NEXT: srli a5, a3, 48 ; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: srli a5, a1, 40 +; RV64I-NEXT: srli a5, a3, 40 ; RV64I-NEXT: sb a5, 29(a2) -; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: srli a5, a3, 32 ; RV64I-NEXT: sb a5, 28(a2) -; RV64I-NEXT: srli a5, a1, 24 +; RV64I-NEXT: srli a5, a3, 24 ; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, a1, 16 +; RV64I-NEXT: srli a5, a3, 16 ; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a1, 24(a2) -; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 25(a2) -; RV64I-NEXT: srli a1, a6, 48 -; RV64I-NEXT: sb a1, 6(a2) -; RV64I-NEXT: srli a1, a6, 40 -; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a6, 32 -; RV64I-NEXT: sb a1, 4(a2) -; RV64I-NEXT: srli a1, a6, 24 -; RV64I-NEXT: sb a1, 3(a2) -; RV64I-NEXT: srli a1, a6, 16 -; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: or a1, a6, t1 -; RV64I-NEXT: sb a6, 0(a2) -; RV64I-NEXT: srli a5, a6, 8 -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: srli a5, a4, 48 -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: srli a5, a4, 40 -; RV64I-NEXT: sb a5, 13(a2) -; RV64I-NEXT: srli a5, a4, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: sb a5, 11(a2) -; RV64I-NEXT: srli a5, a4, 16 -; RV64I-NEXT: sb a5, 10(a2) -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: sb a4, 8(a2) +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 25(a2) +; RV64I-NEXT: srli a3, a4, 48 +; RV64I-NEXT: sb a3, 6(a2) +; RV64I-NEXT: srli a3, a4, 40 +; RV64I-NEXT: sb a3, 5(a2) +; RV64I-NEXT: srli a3, a4, 32 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: srli a3, a4, 24 +; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: srli a3, a4, 16 +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: or a3, a4, t1 +; RV64I-NEXT: sb a4, 0(a2) ; RV64I-NEXT: srli a4, a4, 8 -; RV64I-NEXT: sb a4, 9(a2) +; RV64I-NEXT: sb a4, 1(a2) +; RV64I-NEXT: srli a4, a1, 48 +; RV64I-NEXT: sb a4, 
14(a2) +; RV64I-NEXT: srli a4, a1, 40 +; RV64I-NEXT: sb a4, 13(a2) +; RV64I-NEXT: srli a4, a1, 32 +; RV64I-NEXT: sb a4, 12(a2) +; RV64I-NEXT: srli a4, a1, 24 +; RV64I-NEXT: sb a4, 11(a2) +; RV64I-NEXT: srli a4, a1, 16 +; RV64I-NEXT: sb a4, 10(a2) +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sb a1, 8(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: srli a1, a6, 56 +; RV64I-NEXT: sb a1, 23(a2) ; RV64I-NEXT: srli a3, a3, 56 -; RV64I-NEXT: sb a3, 23(a2) -; RV64I-NEXT: srli a1, a1, 56 -; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: sb a3, 7(a2) ; RV64I-NEXT: srli a0, a0, 56 ; RV64I-NEXT: sb a0, 15(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload @@ -3141,20 +3141,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu s6, 16(a0) ; RV32I-NEXT: lbu s7, 17(a0) ; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu a4, 1(a1) +; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu s9, 19(a0) ; RV32I-NEXT: lbu s10, 20(a0) ; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: lbu ra, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or a4, a4, s11 +; RV32I-NEXT: or a3, a3, s11 ; RV32I-NEXT: lbu s11, 21(a0) ; RV32I-NEXT: slli ra, ra, 16 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, ra ; RV32I-NEXT: lbu ra, 22(a0) -; RV32I-NEXT: or t1, a1, a4 +; RV32I-NEXT: or t1, a1, a3 ; RV32I-NEXT: lbu t0, 23(a0) ; RV32I-NEXT: lbu a7, 24(a0) ; RV32I-NEXT: lbu a6, 25(a0) @@ -3240,23 +3240,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sb a0, 61(sp) ; RV32I-NEXT: slli a0, t1, 24 ; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a3, sp, 28 -; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a1, 4(a3) -; RV32I-NEXT: lbu a4, 6(a3) -; RV32I-NEXT: lbu a5, 7(a3) +; RV32I-NEXT: addi a4, sp, 28 +; RV32I-NEXT: add a4, a4, a0 +; RV32I-NEXT: lbu a0, 5(a4) +; RV32I-NEXT: lbu a1, 4(a4) +; RV32I-NEXT: lbu a3, 6(a4) +; RV32I-NEXT: lbu a5, 7(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: or t4, a4, a0 -; RV32I-NEXT: andi a4, t1, 7 -; RV32I-NEXT: lbu a0, 9(a3) -; RV32I-NEXT: lbu a1, 8(a3) -; RV32I-NEXT: lbu a5, 10(a3) -; RV32I-NEXT: lbu a6, 11(a3) +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or t5, a3, a0 +; RV32I-NEXT: andi a3, t1, 7 +; RV32I-NEXT: lbu a0, 9(a4) +; RV32I-NEXT: lbu a1, 8(a4) +; RV32I-NEXT: lbu a5, 10(a4) +; RV32I-NEXT: lbu a6, 11(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a5, a5, 16 @@ -3264,146 +3264,146 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: or a1, a6, a5 ; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t0, a4 -; RV32I-NEXT: sll a0, a0, t0 -; RV32I-NEXT: lbu a1, 1(a3) -; RV32I-NEXT: lbu a5, 0(a3) -; RV32I-NEXT: lbu a7, 2(a3) -; RV32I-NEXT: lbu t1, 3(a3) +; RV32I-NEXT: not t1, a3 +; RV32I-NEXT: sll a0, a0, t1 +; RV32I-NEXT: lbu a1, 1(a4) +; RV32I-NEXT: lbu a5, 0(a4) +; RV32I-NEXT: lbu a7, 2(a4) +; RV32I-NEXT: lbu t0, 3(a4) ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t1, a7 -; RV32I-NEXT: or t1, a5, a1 -; RV32I-NEXT: slli a1, t4, 1 -; RV32I-NEXT: xori t2, a4, 31 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or t0, a5, a1 +; RV32I-NEXT: slli 
a1, t5, 1 +; RV32I-NEXT: xori t2, a3, 31 ; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: lbu a5, 13(a3) -; RV32I-NEXT: lbu a7, 12(a3) -; RV32I-NEXT: lbu t3, 14(a3) -; RV32I-NEXT: lbu t5, 15(a3) +; RV32I-NEXT: lbu a5, 13(a4) +; RV32I-NEXT: lbu a7, 12(a4) +; RV32I-NEXT: lbu t3, 14(a4) +; RV32I-NEXT: lbu t4, 15(a4) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a7, t5, t3 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or a7, t4, t3 ; RV32I-NEXT: or t3, a7, a5 -; RV32I-NEXT: lbu a5, 17(a3) -; RV32I-NEXT: lbu a7, 16(a3) -; RV32I-NEXT: lbu t5, 18(a3) -; RV32I-NEXT: lbu t6, 19(a3) +; RV32I-NEXT: lbu a5, 17(a4) +; RV32I-NEXT: lbu a7, 16(a4) +; RV32I-NEXT: lbu t4, 18(a4) +; RV32I-NEXT: lbu t6, 19(a4) ; RV32I-NEXT: slli a5, a5, 8 ; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t5 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: slli a7, a5, 1 -; RV32I-NEXT: sll a7, a7, t0 -; RV32I-NEXT: lbu t5, 21(a3) -; RV32I-NEXT: lbu t6, 20(a3) -; RV32I-NEXT: lbu s0, 22(a3) -; RV32I-NEXT: lbu s1, 23(a3) -; RV32I-NEXT: slli t5, t5, 8 -; RV32I-NEXT: or t5, t5, t6 +; RV32I-NEXT: or a7, t6, t4 +; RV32I-NEXT: or t4, a7, a5 +; RV32I-NEXT: slli a5, t4, 1 +; RV32I-NEXT: sll a7, a5, t1 +; RV32I-NEXT: lbu a5, 21(a4) +; RV32I-NEXT: lbu t6, 20(a4) +; RV32I-NEXT: lbu s0, 22(a4) +; RV32I-NEXT: lbu s1, 23(a4) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: lbu t6, 25(a3) -; RV32I-NEXT: lbu s0, 24(a3) -; RV32I-NEXT: lbu s1, 26(a3) -; RV32I-NEXT: lbu s2, 27(a3) -; RV32I-NEXT: slli t6, t6, 8 -; RV32I-NEXT: or t6, t6, s0 +; RV32I-NEXT: or s0, s0, a5 +; RV32I-NEXT: lbu a5, 25(a4) +; RV32I-NEXT: lbu t6, 24(a4) +; RV32I-NEXT: lbu s1, 26(a4) +; RV32I-NEXT: lbu s2, 27(a4) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t6 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: lbu s0, 29(a3) -; RV32I-NEXT: lbu s1, 28(a3) +; RV32I-NEXT: or t6, s2, s1 +; RV32I-NEXT: or t6, t6, a5 +; RV32I-NEXT: lbu a5, 29(a4) +; RV32I-NEXT: lbu s1, 28(a4) ; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t0, s2, t0 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: or s0, s0, s1 -; RV32I-NEXT: lbu s1, 30(a3) -; RV32I-NEXT: lbu a3, 31(a3) +; RV32I-NEXT: sll t1, s2, t1 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, s1 +; RV32I-NEXT: lbu s1, 30(a4) +; RV32I-NEXT: lbu a4, 31(a4) ; RV32I-NEXT: slli s2, t3, 1 ; RV32I-NEXT: sll s2, s2, t2 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, s1 -; RV32I-NEXT: slli s1, t5, 1 +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: slli s1, s0, 1 ; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or a3, a3, s0 -; RV32I-NEXT: slli s0, a3, 1 -; RV32I-NEXT: sll t2, s0, t2 -; RV32I-NEXT: srl t4, t4, a4 -; RV32I-NEXT: srl t1, t1, a4 -; RV32I-NEXT: srl t3, t3, a4 -; RV32I-NEXT: srl a6, a6, a4 -; RV32I-NEXT: srl t5, t5, a4 -; RV32I-NEXT: srl a5, a5, a4 -; RV32I-NEXT: srl t6, t6, a4 -; RV32I-NEXT: sra a3, a3, a4 -; RV32I-NEXT: srli a4, t6, 16 -; RV32I-NEXT: sb a4, 26(a2) -; RV32I-NEXT: or a4, t6, t2 -; RV32I-NEXT: sb t6, 24(a2) -; RV32I-NEXT: srli t2, t6, 8 -; RV32I-NEXT: sb t2, 25(a2) -; RV32I-NEXT: srli t2, a3, 24 -; RV32I-NEXT: sb t2, 31(a2) -; RV32I-NEXT: srli t2, a3, 16 -; 
RV32I-NEXT: sb t2, 30(a2) +; RV32I-NEXT: or s3, a4, a5 +; RV32I-NEXT: slli a4, s3, 1 +; RV32I-NEXT: sll t2, a4, t2 +; RV32I-NEXT: srl a4, t5, a3 +; RV32I-NEXT: srl a5, t0, a3 +; RV32I-NEXT: srl t0, t3, a3 +; RV32I-NEXT: srl a6, a6, a3 +; RV32I-NEXT: srl t3, s0, a3 +; RV32I-NEXT: srl t4, t4, a3 +; RV32I-NEXT: srl t5, t6, a3 +; RV32I-NEXT: sra a3, s3, a3 +; RV32I-NEXT: srli t6, t5, 16 +; RV32I-NEXT: sb t6, 26(a2) +; RV32I-NEXT: or t2, t5, t2 +; RV32I-NEXT: sb t5, 24(a2) +; RV32I-NEXT: srli t5, t5, 8 +; RV32I-NEXT: sb t5, 25(a2) +; RV32I-NEXT: srli t5, a3, 24 +; RV32I-NEXT: sb t5, 31(a2) +; RV32I-NEXT: srli t5, a3, 16 +; RV32I-NEXT: sb t5, 30(a2) ; RV32I-NEXT: sb a3, 28(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 29(a2) -; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: srli a3, t4, 16 ; RV32I-NEXT: sb a3, 18(a2) -; RV32I-NEXT: or s1, a5, s1 -; RV32I-NEXT: sb a5, 16(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 17(a2) -; RV32I-NEXT: srli a3, t5, 16 -; RV32I-NEXT: sb a3, 22(a2) -; RV32I-NEXT: or a3, t5, t0 -; RV32I-NEXT: sb t5, 20(a2) -; RV32I-NEXT: srli a5, t5, 8 -; RV32I-NEXT: sb a5, 21(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: or a5, a6, s2 +; RV32I-NEXT: or a3, t4, s1 +; RV32I-NEXT: sb t4, 16(a2) +; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: sb t4, 17(a2) +; RV32I-NEXT: srli t4, t3, 16 +; RV32I-NEXT: sb t4, 22(a2) +; RV32I-NEXT: or t1, t3, t1 +; RV32I-NEXT: sb t3, 20(a2) +; RV32I-NEXT: srli t3, t3, 8 +; RV32I-NEXT: sb t3, 21(a2) +; RV32I-NEXT: srli t3, a6, 16 +; RV32I-NEXT: sb t3, 10(a2) +; RV32I-NEXT: or t3, a6, s2 ; RV32I-NEXT: sb a6, 8(a2) ; RV32I-NEXT: srli a6, a6, 8 ; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t3, 16 +; RV32I-NEXT: srli a6, t0, 16 ; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t3, a7 -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: srli a7, t3, 8 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: sb t0, 12(a2) +; RV32I-NEXT: srli a7, t0, 8 ; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, t1, 16 +; RV32I-NEXT: srli a7, a5, 16 ; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, t1, a1 -; RV32I-NEXT: sb t1, 0(a2) -; RV32I-NEXT: srli a7, t1, 8 -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: srli a7, t4, 16 -; RV32I-NEXT: sb a7, 6(a2) -; RV32I-NEXT: or a0, t4, a0 -; RV32I-NEXT: sb t4, 4(a2) -; RV32I-NEXT: srli a7, t4, 8 -; RV32I-NEXT: sb a7, 5(a2) -; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: sb a5, 0(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: sb a5, 6(a2) +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: srli a4, t2, 24 ; RV32I-NEXT: sb a4, 27(a2) -; RV32I-NEXT: srli s1, s1, 24 -; RV32I-NEXT: sb s1, 19(a2) ; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: sb a3, 19(a2) +; RV32I-NEXT: srli a3, t1, 24 ; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a3, t3, 24 +; RV32I-NEXT: sb a3, 11(a2) ; RV32I-NEXT: srli a3, a6, 24 ; RV32I-NEXT: sb a3, 15(a2) ; RV32I-NEXT: srli a1, a1, 24 diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll index e274671f88757..ae1de443bce05 100644 --- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll @@ -11,27 +11,27 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: .cfi_window_save ; SPARC-NEXT: 
.cfi_register %o7, %i7 ; SPARC-NEXT: ld [%fp+96], %l1 -; SPARC-NEXT: mov %i3, %l0 +; SPARC-NEXT: mov %i3, %g4 ; SPARC-NEXT: mov %i2, %g2 ; SPARC-NEXT: umul %i3, %l1, %i3 ; SPARC-NEXT: rd %y, %i2 ; SPARC-NEXT: ld [%fp+92], %l2 ; SPARC-NEXT: umul %g2, %l1, %g3 -; SPARC-NEXT: rd %y, %g4 +; SPARC-NEXT: rd %y, %l0 ; SPARC-NEXT: addcc %g3, %i2, %i2 -; SPARC-NEXT: addxcc %g4, 0, %g3 -; SPARC-NEXT: umul %l0, %l2, %g4 +; SPARC-NEXT: addxcc %l0, 0, %g3 +; SPARC-NEXT: umul %g4, %l2, %l0 ; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addcc %g4, %i2, %i2 -; SPARC-NEXT: addxcc %l3, 0, %g4 -; SPARC-NEXT: addcc %g3, %g4, %g3 -; SPARC-NEXT: addxcc %g0, 0, %g4 +; SPARC-NEXT: addcc %l0, %i2, %i2 +; SPARC-NEXT: addxcc %l3, 0, %l0 +; SPARC-NEXT: addcc %g3, %l0, %g3 +; SPARC-NEXT: addxcc %g0, 0, %l0 ; SPARC-NEXT: umul %g2, %l2, %l3 ; SPARC-NEXT: rd %y, %l4 ; SPARC-NEXT: addcc %l3, %g3, %g3 ; SPARC-NEXT: umul %i1, %l1, %l3 ; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addxcc %l4, %g4, %g4 +; SPARC-NEXT: addxcc %l4, %l0, %l0 ; SPARC-NEXT: umul %i0, %l1, %l4 ; SPARC-NEXT: rd %y, %l6 ; SPARC-NEXT: addcc %l4, %l5, %l4 @@ -47,16 +47,16 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addcc %l7, %l5, %l5 ; SPARC-NEXT: addxcc %o0, %l6, %l6 ; SPARC-NEXT: addcc %l3, %g3, %g3 -; SPARC-NEXT: addxcc %l4, %g4, %g4 +; SPARC-NEXT: addxcc %l4, %l0, %l0 ; SPARC-NEXT: addxcc %l5, 0, %l3 -; SPARC-NEXT: umul %l0, %i5, %l4 +; SPARC-NEXT: umul %g4, %i5, %l4 ; SPARC-NEXT: rd %y, %l5 ; SPARC-NEXT: addxcc %l6, 0, %l6 ; SPARC-NEXT: umul %g2, %i5, %l7 ; SPARC-NEXT: rd %y, %o0 ; SPARC-NEXT: addcc %l7, %l5, %l5 ; SPARC-NEXT: addxcc %o0, 0, %l7 -; SPARC-NEXT: umul %l0, %i4, %o0 +; SPARC-NEXT: umul %g4, %i4, %o0 ; SPARC-NEXT: rd %y, %o1 ; SPARC-NEXT: addcc %o0, %l5, %l5 ; SPARC-NEXT: addxcc %o1, 0, %o0 @@ -67,7 +67,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addcc %o1, %l7, %l7 ; SPARC-NEXT: addxcc %o2, %o0, %o0 ; SPARC-NEXT: addcc %l4, %g3, %g3 -; SPARC-NEXT: addxcc %l5, %g4, %g4 +; SPARC-NEXT: addxcc %l5, %l0, %l0 ; SPARC-NEXT: addxcc %l7, 0, %l4 ; SPARC-NEXT: addxcc %o0, 0, %l5 ; SPARC-NEXT: addcc %l3, %l4, %l3 @@ -118,21 +118,21 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addxcc %o0, %o3, %l6 ; SPARC-NEXT: addcc %l2, %o1, %l2 ; SPARC-NEXT: sra %i4, 31, %i4 -; SPARC-NEXT: umul %l0, %i4, %l0 +; SPARC-NEXT: umul %g4, %i4, %g4 ; SPARC-NEXT: rd %y, %o0 ; SPARC-NEXT: addxcc %l6, %l7, %l6 ; SPARC-NEXT: umul %i4, %g2, %g2 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: add %o0, %l0, %o1 +; SPARC-NEXT: add %o0, %g4, %o1 ; SPARC-NEXT: smul %i0, %i4, %i0 ; SPARC-NEXT: umul %i1, %i4, %i1 ; SPARC-NEXT: rd %y, %i4 ; SPARC-NEXT: add %o1, %g2, %o1 ; SPARC-NEXT: add %i4, %i1, %i4 ; SPARC-NEXT: add %i4, %i0, %i0 -; SPARC-NEXT: addcc %i1, %l0, %i1 +; SPARC-NEXT: addcc %i1, %g4, %i1 ; SPARC-NEXT: addxcc %i0, %o1, %i0 -; SPARC-NEXT: addcc %l0, %o0, %i4 +; SPARC-NEXT: addcc %g4, %o0, %i4 ; SPARC-NEXT: addxcc %o0, 0, %o0 ; SPARC-NEXT: addcc %g2, %i4, %i4 ; SPARC-NEXT: addxcc %l7, 0, %o1 @@ -142,7 +142,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addxcc %l7, %o1, %l7 ; SPARC-NEXT: addcc %g2, %i1, %i1 ; SPARC-NEXT: addxcc %l7, %i0, %i0 -; SPARC-NEXT: addcc %l0, %l1, %g2 +; SPARC-NEXT: addcc %g4, %l1, %g2 ; SPARC-NEXT: addxcc %i4, %o2, %i4 ; SPARC-NEXT: addxcc %i1, %l2, %i1 ; SPARC-NEXT: addxcc %i0, %l6, %i0 @@ -150,7 +150,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: addxcc %l4, 
%i4, %i4 ; SPARC-NEXT: addxcc %l5, %i1, %i1 ; SPARC-NEXT: addxcc %i5, %i0, %i0 -; SPARC-NEXT: sra %g4, 31, %i5 +; SPARC-NEXT: sra %l0, 31, %i5 ; SPARC-NEXT: xor %i0, %i5, %i0 ; SPARC-NEXT: xor %i4, %i5, %i4 ; SPARC-NEXT: or %i4, %i0, %i0 @@ -167,7 +167,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: .LBB0_2: ; SPARC-NEXT: mov 1, %i4 ; SPARC-NEXT: .LBB0_3: ! %start -; SPARC-NEXT: mov %g4, %i0 +; SPARC-NEXT: mov %l0, %i0 ; SPARC-NEXT: ret ; SPARC-NEXT: restore %g0, %g3, %o1 ; @@ -226,13 +226,13 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i2, %o3 -; SPARC64-NEXT: srlx %o1, 32, %i3 -; SPARC64-NEXT: srlx %o0, 32, %g2 -; SPARC64-NEXT: addcc %o1, %i5, %i5 -; SPARC64-NEXT: addxcc %i3, %i4, %i3 -; SPARC64-NEXT: addxcc %o0, 0, %i4 -; SPARC64-NEXT: addxcc %g2, 0, %g2 -; SPARC64-NEXT: addcc %l4, %i4, %i4 +; SPARC64-NEXT: srlx %o1, 32, %g2 +; SPARC64-NEXT: srlx %o0, 32, %g3 +; SPARC64-NEXT: addcc %o1, %i5, %i3 +; SPARC64-NEXT: addxcc %g2, %i4, %i4 +; SPARC64-NEXT: addxcc %o0, 0, %i5 +; SPARC64-NEXT: addxcc %g3, 0, %g2 +; SPARC64-NEXT: addcc %l4, %i5, %i5 ; SPARC64-NEXT: addxcc %l5, %g2, %l4 ; SPARC64-NEXT: addxcc %g0, 0, %l5 ; SPARC64-NEXT: addxcc %g0, 0, %l6 @@ -243,29 +243,29 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC64-NEXT: mov %i2, %o3 ; SPARC64-NEXT: mov %g0, %i2 ; SPARC64-NEXT: srlx %o1, 32, %i0 -; SPARC64-NEXT: addcc %o1, %i4, %i4 +; SPARC64-NEXT: addcc %o1, %i5, %i5 ; SPARC64-NEXT: srlx %o0, 32, %g2 ; SPARC64-NEXT: addxcc %i0, %l4, %i0 ; SPARC64-NEXT: addxcc %o0, %l5, %g3 ; SPARC64-NEXT: addxcc %g2, %l6, %g2 -; SPARC64-NEXT: addcc %i4, %l0, %i4 +; SPARC64-NEXT: addcc %i5, %l0, %i5 ; SPARC64-NEXT: addxcc %i0, %l1, %i0 ; SPARC64-NEXT: addxcc %g3, %l2, %g3 ; SPARC64-NEXT: addxcc %g2, %l3, %g2 ; SPARC64-NEXT: srl %g3, 0, %g3 ; SPARC64-NEXT: sllx %g2, 32, %g2 ; SPARC64-NEXT: or %g2, %g3, %g2 -; SPARC64-NEXT: sllx %i3, 32, %i3 -; SPARC64-NEXT: srax %i3, 63, %g3 +; SPARC64-NEXT: sllx %i4, 32, %i4 +; SPARC64-NEXT: srax %i4, 63, %g3 ; SPARC64-NEXT: xor %g2, %g3, %g2 -; SPARC64-NEXT: srl %i4, 0, %i4 +; SPARC64-NEXT: srl %i5, 0, %i5 ; SPARC64-NEXT: sllx %i0, 32, %i0 -; SPARC64-NEXT: or %i0, %i4, %i0 +; SPARC64-NEXT: or %i0, %i5, %i0 ; SPARC64-NEXT: xor %i0, %g3, %i0 ; SPARC64-NEXT: or %i0, %g2, %i0 ; SPARC64-NEXT: movrnz %i0, 1, %i2 -; SPARC64-NEXT: srl %i5, 0, %i0 -; SPARC64-NEXT: or %i3, %i0, %i0 +; SPARC64-NEXT: srl %i3, 0, %i0 +; SPARC64-NEXT: or %i4, %i0, %i0 ; SPARC64-NEXT: srl %i2, 0, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll index 257e9723f2cf3..9ca895fe78073 100644 --- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll @@ -13,147 +13,147 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; SPARC-NEXT: mov %i3, %g2 ; SPARC-NEXT: mov %i2, %g4 ; SPARC-NEXT: umul %i2, %i5, %i2 -; SPARC-NEXT: rd %y, %o0 -; SPARC-NEXT: ld [%fp+92], %l6 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: ld [%fp+92], %l4 ; SPARC-NEXT: umul %i4, %i3, %i3 -; SPARC-NEXT: rd %y, %o2 +; SPARC-NEXT: rd %y, %o1 ; SPARC-NEXT: ld [%fp+96], %g3 -; SPARC-NEXT: umul %i5, %g2, %l0 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: umul %l6, %i1, %l2 -; SPARC-NEXT: rd %y, %l3 +; SPARC-NEXT: umul %i5, %g2, %l3 +; SPARC-NEXT: rd %y, %o0 +; 
SPARC-NEXT: umul %l4, %i1, %l2 +; SPARC-NEXT: rd %y, %l1 ; SPARC-NEXT: add %i3, %i2, %i2 ; SPARC-NEXT: umul %i0, %g3, %i3 -; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: add %l7, %i2, %o1 +; SPARC-NEXT: rd %y, %l6 +; SPARC-NEXT: add %o0, %i2, %o2 ; SPARC-NEXT: umul %i1, %g3, %i2 -; SPARC-NEXT: rd %y, %l1 +; SPARC-NEXT: rd %y, %l0 ; SPARC-NEXT: add %i3, %l2, %i3 -; SPARC-NEXT: add %l1, %i3, %l2 -; SPARC-NEXT: addcc %i2, %l0, %l0 +; SPARC-NEXT: add %l0, %i3, %l2 +; SPARC-NEXT: addcc %i2, %l3, %l3 ; SPARC-NEXT: umul %g2, %g3, %i3 ; SPARC-NEXT: rd %y, %i2 -; SPARC-NEXT: addxcc %l2, %o1, %o4 +; SPARC-NEXT: addxcc %l2, %o2, %o4 ; SPARC-NEXT: umul %g4, %g3, %g3 -; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: rd %y, %l5 ; SPARC-NEXT: addcc %g3, %i2, %i2 -; SPARC-NEXT: addxcc %l4, 0, %g3 -; SPARC-NEXT: umul %g2, %l6, %g2 -; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addxcc %l5, 0, %g3 +; SPARC-NEXT: umul %g2, %l4, %g2 +; SPARC-NEXT: rd %y, %l5 ; SPARC-NEXT: addcc %g2, %i2, %i2 -; SPARC-NEXT: addxcc %l4, 0, %g2 +; SPARC-NEXT: addxcc %l5, 0, %g2 ; SPARC-NEXT: addcc %g3, %g2, %g2 ; SPARC-NEXT: addxcc %g0, 0, %g3 -; SPARC-NEXT: umul %g4, %l6, %l4 +; SPARC-NEXT: umul %g4, %l4, %l5 ; SPARC-NEXT: rd %y, %o3 -; SPARC-NEXT: addcc %l4, %g2, %l4 +; SPARC-NEXT: addcc %l5, %g2, %l5 ; SPARC-NEXT: addxcc %o3, %g3, %o3 -; SPARC-NEXT: addcc %l4, %l0, %g2 +; SPARC-NEXT: addcc %l5, %l3, %g2 ; SPARC-NEXT: addxcc %o3, %o4, %g3 -; SPARC-NEXT: mov 1, %l0 +; SPARC-NEXT: mov 1, %l3 ; SPARC-NEXT: cmp %g3, %o3 ; SPARC-NEXT: bcs .LBB0_2 -; SPARC-NEXT: mov %l0, %o4 +; SPARC-NEXT: mov %l3, %o4 ; SPARC-NEXT: ! %bb.1: ! %start ; SPARC-NEXT: mov %g0, %o4 ; SPARC-NEXT: .LBB0_2: ! %start -; SPARC-NEXT: cmp %g2, %l4 +; SPARC-NEXT: cmp %g2, %l5 ; SPARC-NEXT: bcs .LBB0_4 -; SPARC-NEXT: mov %l0, %l4 +; SPARC-NEXT: mov %l3, %l5 ; SPARC-NEXT: ! %bb.3: ! %start -; SPARC-NEXT: mov %g0, %l4 +; SPARC-NEXT: mov %g0, %l5 ; SPARC-NEXT: .LBB0_4: ! %start ; SPARC-NEXT: cmp %g3, %o3 ; SPARC-NEXT: be .LBB0_6 ; SPARC-NEXT: nop ; SPARC-NEXT: ! %bb.5: ! %start -; SPARC-NEXT: mov %o4, %l4 +; SPARC-NEXT: mov %o4, %l5 ; SPARC-NEXT: .LBB0_6: ! %start ; SPARC-NEXT: cmp %g4, 0 ; SPARC-NEXT: bne .LBB0_8 -; SPARC-NEXT: mov %l0, %g4 +; SPARC-NEXT: mov %l3, %o3 ; SPARC-NEXT: ! %bb.7: ! %start -; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: mov %g0, %o3 ; SPARC-NEXT: .LBB0_8: ! %start ; SPARC-NEXT: cmp %i4, 0 ; SPARC-NEXT: bne .LBB0_10 -; SPARC-NEXT: mov %l0, %o3 +; SPARC-NEXT: mov %l3, %o4 ; SPARC-NEXT: ! %bb.9: ! %start -; SPARC-NEXT: mov %g0, %o3 +; SPARC-NEXT: mov %g0, %o4 ; SPARC-NEXT: .LBB0_10: ! %start -; SPARC-NEXT: cmp %o2, 0 +; SPARC-NEXT: cmp %o1, 0 ; SPARC-NEXT: bne .LBB0_12 -; SPARC-NEXT: mov %l0, %o2 +; SPARC-NEXT: mov %l3, %o1 ; SPARC-NEXT: ! %bb.11: ! %start -; SPARC-NEXT: mov %g0, %o2 +; SPARC-NEXT: mov %g0, %o1 ; SPARC-NEXT: .LBB0_12: ! %start -; SPARC-NEXT: cmp %o0, 0 +; SPARC-NEXT: cmp %l7, 0 ; SPARC-NEXT: bne .LBB0_14 -; SPARC-NEXT: mov %l0, %o0 +; SPARC-NEXT: mov %l3, %l7 ; SPARC-NEXT: ! %bb.13: ! %start -; SPARC-NEXT: mov %g0, %o0 +; SPARC-NEXT: mov %g0, %l7 ; SPARC-NEXT: .LBB0_14: ! %start -; SPARC-NEXT: cmp %o1, %l7 +; SPARC-NEXT: cmp %o2, %o0 ; SPARC-NEXT: bcs .LBB0_16 -; SPARC-NEXT: mov %l0, %l7 +; SPARC-NEXT: mov %l3, %g4 ; SPARC-NEXT: ! %bb.15: ! %start -; SPARC-NEXT: mov %g0, %l7 +; SPARC-NEXT: mov %g0, %g4 ; SPARC-NEXT: .LBB0_16: ! %start -; SPARC-NEXT: cmp %l6, 0 +; SPARC-NEXT: cmp %l4, 0 ; SPARC-NEXT: bne .LBB0_18 -; SPARC-NEXT: mov %l0, %l6 +; SPARC-NEXT: mov %l3, %l4 ; SPARC-NEXT: ! %bb.17: ! 
%start -; SPARC-NEXT: mov %g0, %l6 +; SPARC-NEXT: mov %g0, %l4 ; SPARC-NEXT: .LBB0_18: ! %start ; SPARC-NEXT: cmp %i0, 0 ; SPARC-NEXT: bne .LBB0_20 -; SPARC-NEXT: mov %l0, %o1 +; SPARC-NEXT: mov %l3, %o0 ; SPARC-NEXT: ! %bb.19: ! %start -; SPARC-NEXT: mov %g0, %o1 +; SPARC-NEXT: mov %g0, %o0 ; SPARC-NEXT: .LBB0_20: ! %start -; SPARC-NEXT: cmp %l5, 0 +; SPARC-NEXT: cmp %l6, 0 ; SPARC-NEXT: bne .LBB0_22 -; SPARC-NEXT: mov %l0, %l5 +; SPARC-NEXT: mov %l3, %l6 ; SPARC-NEXT: ! %bb.21: ! %start -; SPARC-NEXT: mov %g0, %l5 +; SPARC-NEXT: mov %g0, %l6 ; SPARC-NEXT: .LBB0_22: ! %start -; SPARC-NEXT: and %o3, %g4, %g4 -; SPARC-NEXT: cmp %l3, 0 -; SPARC-NEXT: and %o1, %l6, %o1 +; SPARC-NEXT: and %o4, %o3, %o2 +; SPARC-NEXT: cmp %l1, 0 +; SPARC-NEXT: and %o0, %l4, %l4 ; SPARC-NEXT: bne .LBB0_24 -; SPARC-NEXT: mov %l0, %l3 +; SPARC-NEXT: mov %l3, %l1 ; SPARC-NEXT: ! %bb.23: ! %start -; SPARC-NEXT: mov %g0, %l3 +; SPARC-NEXT: mov %g0, %l1 ; SPARC-NEXT: .LBB0_24: ! %start -; SPARC-NEXT: or %g4, %o2, %l6 -; SPARC-NEXT: cmp %l2, %l1 -; SPARC-NEXT: or %o1, %l5, %l2 +; SPARC-NEXT: or %o2, %o1, %o0 +; SPARC-NEXT: cmp %l2, %l0 +; SPARC-NEXT: or %l4, %l6, %l4 ; SPARC-NEXT: bcs .LBB0_26 -; SPARC-NEXT: mov %l0, %g4 +; SPARC-NEXT: mov %l3, %l0 ; SPARC-NEXT: ! %bb.25: ! %start -; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: mov %g0, %l0 ; SPARC-NEXT: .LBB0_26: ! %start -; SPARC-NEXT: or %l6, %o0, %l1 +; SPARC-NEXT: or %o0, %l7, %l2 ; SPARC-NEXT: or %i5, %i4, %i4 ; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: or %l2, %l3, %l2 +; SPARC-NEXT: or %l4, %l1, %l1 ; SPARC-NEXT: bne .LBB0_28 -; SPARC-NEXT: mov %l0, %i4 +; SPARC-NEXT: mov %l3, %i4 ; SPARC-NEXT: ! %bb.27: ! %start ; SPARC-NEXT: mov %g0, %i4 ; SPARC-NEXT: .LBB0_28: ! %start -; SPARC-NEXT: or %l1, %l7, %i5 +; SPARC-NEXT: or %l2, %g4, %i5 ; SPARC-NEXT: or %i1, %i0, %i0 ; SPARC-NEXT: cmp %i0, 0 ; SPARC-NEXT: bne .LBB0_30 -; SPARC-NEXT: or %l2, %g4, %i0 +; SPARC-NEXT: or %l1, %l0, %i0 ; SPARC-NEXT: ! %bb.29: ! %start -; SPARC-NEXT: mov %g0, %l0 +; SPARC-NEXT: mov %g0, %l3 ; SPARC-NEXT: .LBB0_30: ! 
%start -; SPARC-NEXT: and %l0, %i4, %i1 +; SPARC-NEXT: and %l3, %i4, %i1 ; SPARC-NEXT: or %i1, %i0, %i0 ; SPARC-NEXT: or %i0, %i5, %i0 -; SPARC-NEXT: or %i0, %l4, %i0 +; SPARC-NEXT: or %i0, %l5, %i0 ; SPARC-NEXT: and %i0, 1, %i4 ; SPARC-NEXT: mov %g3, %i0 ; SPARC-NEXT: ret diff --git a/llvm/test/CodeGen/SPIRV/EnqueueEmptyKernel.ll b/llvm/test/CodeGen/SPIRV/EnqueueEmptyKernel.ll index 2112464de75a7..679f8ff7a0017 100644 --- a/llvm/test/CodeGen/SPIRV/EnqueueEmptyKernel.ll +++ b/llvm/test/CodeGen/SPIRV/EnqueueEmptyKernel.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ;; This test checks that Invoke parameter of OpEnueueKernel instruction meet the ;; following specification requirements in case of enqueueing empty block: diff --git a/llvm/test/CodeGen/SPIRV/SampledImageRetType.ll b/llvm/test/CodeGen/SPIRV/SampledImageRetType.ll index 00dbf6fe658bd..1aa3af83bcd20 100644 --- a/llvm/test/CodeGen/SPIRV/SampledImageRetType.ll +++ b/llvm/test/CodeGen/SPIRV/SampledImageRetType.ll @@ -1,26 +1,24 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s -%opencl.image1d_ro_t = type opaque ; CHECK: %[[#image1d_t:]] = OpTypeImage -%opencl.sampler_t = type opaque ; CHECK: %[[#sampler_t:]] = OpTypeSampler ; CHECK: %[[#sampled_image_t:]] = OpTypeSampledImage -declare dso_local spir_func i8 addrspace(4)* @_Z20__spirv_SampledImageI14ocl_image1d_roPvET0_T_11ocl_sampler(%opencl.image1d_ro_t addrspace(1)*, %opencl.sampler_t addrspace(2)*) local_unnamed_addr +declare dso_local spir_func ptr addrspace(4) @_Z20__spirv_SampledImageI14ocl_image1d_roPvET0_T_11ocl_sampler(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %0, target("spirv.Sampler") %1) local_unnamed_addr -declare dso_local spir_func <4 x float> @_Z30__spirv_ImageSampleExplicitLodIPvDv4_fiET0_T_T1_if(i8 addrspace(4)*, i32, i32, float) local_unnamed_addr +declare dso_local spir_func <4 x float> @_Z30__spirv_ImageSampleExplicitLodIPvDv4_fiET0_T_T1_if(ptr addrspace(4) %0, i32 %1, i32 %2, float %3) local_unnamed_addr @__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(2) constant <3 x i64>, align 32 -define weak_odr dso_local spir_kernel void @_ZTS17image_kernel_readILi1EE(%opencl.image1d_ro_t addrspace(1)*, %opencl.sampler_t addrspace(2)*) { +define weak_odr dso_local spir_kernel void @_ZTS17image_kernel_readILi1EE(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0), target("spirv.Sampler")) { ; CHECK: OpFunction ; CHECK: %[[#image:]] = OpFunctionParameter %[[#image1d_t]] ; CHECK: %[[#sampler:]] = OpFunctionParameter %[[#sampler_t]] - %3 = load <3 x i64>, <3 x i64> addrspace(2)* @__spirv_BuiltInGlobalInvocationId, align 32 + %3 = load <3 x i64>, ptr addrspace(2) @__spirv_BuiltInGlobalInvocationId, align 32 %4 = extractelement <3 x i64> %3, i64 0 %5 = trunc i64 %4 to i32 - %6 = tail call spir_func i8 addrspace(4)* @_Z20__spirv_SampledImageI14ocl_image1d_roPvET0_T_11ocl_sampler(%opencl.image1d_ro_t addrspace(1)* %0, %opencl.sampler_t addrspace(2)* %1) - %7 = tail call spir_func <4 x float> @_Z30__spirv_ImageSampleExplicitLodIPvDv4_fiET0_T_T1_if(i8 addrspace(4)* %6, i32 %5, i32 2, float 0.000000e+00) + %6 = call spir_func ptr addrspace(4) @_Z20__spirv_SampledImageI14ocl_image1d_roPvET0_T_11ocl_sampler(target("spirv.Image", void, 0, 0, 0, 
0, 0, 0, 0) %0, target("spirv.Sampler") %1) + %7 = call spir_func <4 x float> @_Z30__spirv_ImageSampleExplicitLodIPvDv4_fiET0_T_T1_if(ptr addrspace(4) %6, i32 %5, i32 2, float 0.000000e+00) ; CHECK: %[[#sampled_image:]] = OpSampledImage %[[#sampled_image_t]] %[[#image]] %[[#sampler]] ; CHECK: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#sampled_image]] %[[#]] {{.*}} %[[#]] diff --git a/llvm/test/CodeGen/SPIRV/atomicrmw.ll b/llvm/test/CodeGen/SPIRV/atomicrmw.ll index de192d5bd422d..401b21224e7dd 100644 --- a/llvm/test/CodeGen/SPIRV/atomicrmw.ll +++ b/llvm/test/CodeGen/SPIRV/atomicrmw.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: %[[#Int:]] = OpTypeInt 32 0 ; CHECK-DAG: %[[#Scope_Device:]] = OpConstant %[[#Int]] 1 {{$}} diff --git a/llvm/test/CodeGen/SPIRV/basic_float_types.ll b/llvm/test/CodeGen/SPIRV/basic_float_types.ll new file mode 100644 index 0000000000000..4287adc85cfd8 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/basic_float_types.ll @@ -0,0 +1,51 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s + +define void @main() { +entry: +; CHECK-DAG: %[[#float:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#double:]] = OpTypeFloat 64 + +; CHECK-DAG: %[[#v2float:]] = OpTypeVector %[[#float]] 2 +; CHECK-DAG: %[[#v3float:]] = OpTypeVector %[[#float]] 3 +; CHECK-DAG: %[[#v4float:]] = OpTypeVector %[[#float]] 4 + +; CHECK-DAG: %[[#v2double:]] = OpTypeVector %[[#double]] 2 +; CHECK-DAG: %[[#v3double:]] = OpTypeVector %[[#double]] 3 +; CHECK-DAG: %[[#v4double:]] = OpTypeVector %[[#double]] 4 + +; CHECK-DAG: %[[#ptr_Function_float:]] = OpTypePointer Function %[[#float]] +; CHECK-DAG: %[[#ptr_Function_double:]] = OpTypePointer Function %[[#double]] +; CHECK-DAG: %[[#ptr_Function_v2float:]] = OpTypePointer Function %[[#v2float]] +; CHECK-DAG: %[[#ptr_Function_v3float:]] = OpTypePointer Function %[[#v3float]] +; CHECK-DAG: %[[#ptr_Function_v4float:]] = OpTypePointer Function %[[#v4float]] +; CHECK-DAG: %[[#ptr_Function_v2double:]] = OpTypePointer Function %[[#v2double]] +; CHECK-DAG: %[[#ptr_Function_v3double:]] = OpTypePointer Function %[[#v3double]] +; CHECK-DAG: %[[#ptr_Function_v4double:]] = OpTypePointer Function %[[#v4double]] + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_float]] Function + %float_Val = alloca float, align 4 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_double]] Function + %double_Val = alloca double, align 8 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v2float]] Function + %float2_Val = alloca <2 x float>, align 8 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v3float]] Function + %float3_Val = alloca <3 x float>, align 16 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v4float]] Function + %float4_Val = alloca <4 x float>, align 16 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v2double]] Function + %double2_Val = alloca <2 x double>, align 16 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v3double]] Function + %double3_Val = alloca <3 x double>, align 32 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v4double]] Function + %double4_Val = alloca <4 x double>, align 32 + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/basic_int_types.ll b/llvm/test/CodeGen/SPIRV/basic_int_types.ll new file mode 100644 index 0000000000000..b479cee0bed71 --- /dev/null +++ 
b/llvm/test/CodeGen/SPIRV/basic_int_types.ll @@ -0,0 +1,73 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s + +define void @main() { +entry: +; CHECK-DAG: %[[#short:]] = OpTypeInt 16 0 +; CHECK-DAG: %[[#int:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#long:]] = OpTypeInt 64 0 + +; CHECK-DAG: %[[#v2short:]] = OpTypeVector %[[#short]] 2 +; CHECK-DAG: %[[#v3short:]] = OpTypeVector %[[#short]] 3 +; CHECK-DAG: %[[#v4short:]] = OpTypeVector %[[#short]] 4 + +; CHECK-DAG: %[[#v2int:]] = OpTypeVector %[[#int]] 2 +; CHECK-DAG: %[[#v3int:]] = OpTypeVector %[[#int]] 3 +; CHECK-DAG: %[[#v4int:]] = OpTypeVector %[[#int]] 4 + +; CHECK-DAG: %[[#v2long:]] = OpTypeVector %[[#long]] 2 +; CHECK-DAG: %[[#v3long:]] = OpTypeVector %[[#long]] 3 +; CHECK-DAG: %[[#v4long:]] = OpTypeVector %[[#long]] 4 + +; CHECK-DAG: %[[#ptr_Function_short:]] = OpTypePointer Function %[[#short]] +; CHECK-DAG: %[[#ptr_Function_int:]] = OpTypePointer Function %[[#int]] +; CHECK-DAG: %[[#ptr_Function_long:]] = OpTypePointer Function %[[#long]] +; CHECK-DAG: %[[#ptr_Function_v2short:]] = OpTypePointer Function %[[#v2short]] +; CHECK-DAG: %[[#ptr_Function_v3short:]] = OpTypePointer Function %[[#v3short]] +; CHECK-DAG: %[[#ptr_Function_v4short:]] = OpTypePointer Function %[[#v4short]] +; CHECK-DAG: %[[#ptr_Function_v2int:]] = OpTypePointer Function %[[#v2int]] +; CHECK-DAG: %[[#ptr_Function_v3int:]] = OpTypePointer Function %[[#v3int]] +; CHECK-DAG: %[[#ptr_Function_v4int:]] = OpTypePointer Function %[[#v4int]] +; CHECK-DAG: %[[#ptr_Function_v2long:]] = OpTypePointer Function %[[#v2long]] +; CHECK-DAG: %[[#ptr_Function_v3long:]] = OpTypePointer Function %[[#v3long]] +; CHECK-DAG: %[[#ptr_Function_v4long:]] = OpTypePointer Function %[[#v4long]] + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_short]] Function + %int16_t_Val = alloca i16, align 2 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_int]] Function + %int_Val = alloca i32, align 4 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_long]] Function + %int64_t_Val = alloca i64, align 8 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v2short]] Function + %int16_t2_Val = alloca <2 x i16>, align 4 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v3short]] Function + %int16_t3_Val = alloca <3 x i16>, align 8 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v4short]] Function + %int16_t4_Val = alloca <4 x i16>, align 8 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v2int]] Function + %int2_Val = alloca <2 x i32>, align 8 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v3int]] Function + %int3_Val = alloca <3 x i32>, align 16 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v4int]] Function + %int4_Val = alloca <4 x i32>, align 16 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v2long]] Function + %int64_t2_Val = alloca <2 x i64>, align 16 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v3long]] Function + %int64_t3_Val = alloca <3 x i64>, align 32 + +; CHECK: %[[#]] = OpVariable %[[#ptr_Function_v4long]] Function + %int64_t4_Val = alloca <4 x i64>, align 32 + + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/constant/global-constants.ll b/llvm/test/CodeGen/SPIRV/constant/global-constants.ll index 33eaec7c5042d..916c70628d016 100644 --- a/llvm/test/CodeGen/SPIRV/constant/global-constants.ll +++ b/llvm/test/CodeGen/SPIRV/constant/global-constants.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 
-mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s @global = addrspace(1) constant i32 1 ; OpenCL global memory @constant = addrspace(2) constant i32 2 ; OpenCL constant memory diff --git a/llvm/test/CodeGen/SPIRV/constant/local-null-constants.ll b/llvm/test/CodeGen/SPIRV/constant/local-null-constants.ll index 05b738d227e72..8009f488f6dd2 100644 --- a/llvm/test/CodeGen/SPIRV/constant/local-null-constants.ll +++ b/llvm/test/CodeGen/SPIRV/constant/local-null-constants.ll @@ -1,21 +1,21 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ;; OpenCL global memory -define i32 addrspace(1)* @getConstant1() { - ret i32 addrspace(1)* null +define ptr addrspace(1) @getConstant1() { + ret ptr addrspace(1) null } ;; OpenCL constant memory -define i32 addrspace(2)* @getConstant2() { - ret i32 addrspace(2)* null +define ptr addrspace(2) @getConstant2() { + ret ptr addrspace(2) null } ;; OpenCL local memory -define i32 addrspace(3)* @getConstant3() { - ret i32 addrspace(3)* null +define ptr addrspace(3) @getConstant3() { + ret ptr addrspace(3) null } -; CHECK: [[INT:%.+]] = OpTypeInt 32 +; CHECK: [[INT:%.+]] = OpTypeInt 8 ; CHECK-DAG: [[PTR_AS1:%.+]] = OpTypePointer CrossWorkgroup [[INT]] ; CHECK-DAG: OpConstantNull [[PTR_AS1]] diff --git a/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll b/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll index d4cac793cf47d..5e76eb11de633 100644 --- a/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll +++ b/llvm/test/CodeGen/SPIRV/function/alloca-load-store.ll @@ -1,12 +1,15 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s + +target triple = "spirv32-unknown-unknown" ; CHECK-DAG: OpName %[[#BAR:]] "bar" ; CHECK-DAG: OpName %[[#FOO:]] "foo" ; CHECK-DAG: OpName %[[#GOO:]] "goo" -; CHECK: %[[#INT:]] = OpTypeInt 32 +; CHECK-DAG: %[[#CHAR:]] = OpTypeInt 8 +; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 ; CHECK-DAG: %[[#STACK_PTR:]] = OpTypePointer Function %[[#INT]] -; CHECK-DAG: %[[#GLOBAL_PTR:]] = OpTypePointer CrossWorkgroup %[[#INT]] +; CHECK-DAG: %[[#GLOBAL_PTR:]] = OpTypePointer CrossWorkgroup %[[#CHAR]] ; CHECK-DAG: %[[#FN1:]] = OpTypeFunction %[[#INT]] %[[#INT]] ; CHECK-DAG: %[[#FN2:]] = OpTypeFunction %[[#INT]] %[[#INT]] %[[#GLOBAL_PTR]] diff --git a/llvm/test/CodeGen/SPIRV/half_extension.ll b/llvm/test/CodeGen/SPIRV/half_extension.ll index af1c858cfb759..b30e5514c95be 100644 --- a/llvm/test/CodeGen/SPIRV/half_extension.ll +++ b/llvm/test/CodeGen/SPIRV/half_extension.ll @@ -7,7 +7,7 @@ ;; return y; ;; } -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-DAG: OpCapability Float16Buffer ; CHECK-SPIRV-DAG: OpCapability Float16 diff --git a/llvm/test/CodeGen/SPIRV/half_no_extension.ll b/llvm/test/CodeGen/SPIRV/half_no_extension.ll index 611d029065342..a5b0ec9c92d23 100644 --- a/llvm/test/CodeGen/SPIRV/half_no_extension.ll +++ b/llvm/test/CodeGen/SPIRV/half_no_extension.ll @@ -5,7 +5,7 @@ ;; vstorea_half4_rtp( data, 0, f ); ;; } -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 
-mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: OpCapability Float16Buffer ; CHECK-SPIRV-NOT: OpCapability Float16 diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll new file mode 100644 index 0000000000000..7031129de21ac --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/abs.ll @@ -0,0 +1,27 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - | FileCheck %s + +; CHECK: OpExtInstImport "GLSL.std.450" + +define void @main() #1 { +entry: + %i = alloca i32, align 4 + %absi = alloca i32, align 4 + %f = alloca float, align 4 + %absf = alloca float, align 4 + %0 = load i32, ptr %i, align 4 + +; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] SAbs %[[#]] + %elt.abs = call i32 @llvm.abs.i32(i32 %0, i1 false) + + store i32 %elt.abs, ptr %absi, align 4 + %1 = load float, ptr %f, align 4 + +; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FAbs %[[#]] + %elt.abs1 = call float @llvm.fabs.f32(float %1) + + store float %elt.abs1, ptr %absf, align 4 + ret void +} + +declare i32 @llvm.abs.i32(i32, i1 immarg) #2 +declare float @llvm.fabs.f32(float) #2 diff --git a/llvm/test/CodeGen/SPIRV/image-unoptimized.ll b/llvm/test/CodeGen/SPIRV/image-unoptimized.ll index 5fb51565ccbe4..0ce9c73ab103f 100644 --- a/llvm/test/CodeGen/SPIRV/image-unoptimized.ll +++ b/llvm/test/CodeGen/SPIRV/image-unoptimized.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; CHECK: %[[#TypeImage:]] = OpTypeImage ; CHECK: %[[#TypeSampler:]] = OpTypeSampler @@ -43,38 +43,35 @@ ;; results[tid_x + tid_y * get_image_width(srcimg)] = read_imagef(srcimg, sampler, (int2){tid_x, tid_y}); ;; } -%opencl.image2d_ro_t = type opaque -%opencl.sampler_t = type opaque - -define dso_local spir_kernel void @test_fn(%opencl.image2d_ro_t addrspace(1)* %srcimg, %opencl.sampler_t addrspace(2)* %sampler, <4 x float> addrspace(1)* noundef %results) { +define dso_local spir_kernel void @test_fn(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %srcimg, target("spirv.Sampler") %sampler, <4 x float> addrspace(1)* noundef %results) { entry: - %srcimg.addr = alloca %opencl.image2d_ro_t addrspace(1)*, align 4 - %sampler.addr = alloca %opencl.sampler_t addrspace(2)*, align 4 + %srcimg.addr = alloca target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), align 4 + %sampler.addr = alloca target("spirv.Sampler"), align 4 %results.addr = alloca <4 x float> addrspace(1)*, align 4 %tid_x = alloca i32, align 4 %tid_y = alloca i32, align 4 %.compoundliteral = alloca <2 x i32>, align 8 - store %opencl.image2d_ro_t addrspace(1)* %srcimg, %opencl.image2d_ro_t addrspace(1)** %srcimg.addr, align 4 - store %opencl.sampler_t addrspace(2)* %sampler, %opencl.sampler_t addrspace(2)** %sampler.addr, align 4 + store target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %srcimg, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)* %srcimg.addr, align 4 + store target("spirv.Sampler") %sampler, target("spirv.Sampler")* %sampler.addr, align 4 store <4 x float> addrspace(1)* %results, <4 x float> addrspace(1)** %results.addr, align 4 %call = call spir_func i32 @_Z13get_global_idj(i32 noundef 0) store i32 %call, i32* %tid_x, align 4 %call1 = call spir_func i32 @_Z13get_global_idj(i32 noundef 1) store i32 %call1, i32* %tid_y, align 4 - %0 = load %opencl.image2d_ro_t addrspace(1)*, %opencl.image2d_ro_t addrspace(1)** %srcimg.addr, align 4 - %1 = load %opencl.sampler_t 
addrspace(2)*, %opencl.sampler_t addrspace(2)** %sampler.addr, align 4 + %0 = load target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)* %srcimg.addr, align 4 + %1 = load target("spirv.Sampler"), target("spirv.Sampler")* %sampler.addr, align 4 %2 = load i32, i32* %tid_x, align 4 %vecinit = insertelement <2 x i32> undef, i32 %2, i32 0 %3 = load i32, i32* %tid_y, align 4 %vecinit2 = insertelement <2 x i32> %vecinit, i32 %3, i32 1 store <2 x i32> %vecinit2, <2 x i32>* %.compoundliteral, align 8 %4 = load <2 x i32>, <2 x i32>* %.compoundliteral, align 8 - %call3 = call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_i(%opencl.image2d_ro_t addrspace(1)* %0, %opencl.sampler_t addrspace(2)* %1, <2 x i32> noundef %4) + %call3 = call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %0, target("spirv.Sampler") %1, <2 x i32> noundef %4) %5 = load <4 x float> addrspace(1)*, <4 x float> addrspace(1)** %results.addr, align 4 %6 = load i32, i32* %tid_x, align 4 %7 = load i32, i32* %tid_y, align 4 - %8 = load %opencl.image2d_ro_t addrspace(1)*, %opencl.image2d_ro_t addrspace(1)** %srcimg.addr, align 4 - %call4 = call spir_func i32 @_Z15get_image_width14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)* %8) + %8 = load target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)* %srcimg.addr, align 4 + %call4 = call spir_func i32 @_Z15get_image_width14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %8) %mul = mul nsw i32 %7, %call4 %add = add nsw i32 %6, %mul %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %5, i32 %add @@ -84,6 +81,6 @@ entry: declare spir_func i32 @_Z13get_global_idj(i32 noundef) -declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_i(%opencl.image2d_ro_t addrspace(1)*, %opencl.sampler_t addrspace(2)*, <2 x i32> noundef) +declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), target("spirv.Sampler"), <2 x i32> noundef) -declare spir_func i32 @_Z15get_image_width14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) diff --git a/llvm/test/CodeGen/SPIRV/image.ll b/llvm/test/CodeGen/SPIRV/image.ll index a852429bf05ea..8d4ab7f3cbf0e 100644 --- a/llvm/test/CodeGen/SPIRV/image.ll +++ b/llvm/test/CodeGen/SPIRV/image.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: %[[#VOID_TY:]] = OpTypeVoid ; CHECK-SPIRV-DAG: %[[#]] = OpTypeImage %[[#VOID_TY]] 2D 0 0 0 0 Unknown ReadOnly @@ -7,25 +7,22 @@ ; CHECK-SPIRV: OpImageSampleExplicitLod ; CHECK-SPIRV: OpImageWrite -%opencl.image2d_t = type opaque - -define spir_kernel void @image_copy(%opencl.image2d_t addrspace(1)* readnone %image1, %opencl.image2d_t addrspace(1)* %image2) !kernel_arg_access_qual !1 { -entry: +define spir_kernel void @image_copy(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %image1, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %image2) !kernel_arg_access_qual !1 { %call = tail call spir_func i64 @_Z13get_global_idj(i32 0) %conv = trunc i64 %call to i32 %call1 = tail call spir_func i64 
@_Z13get_global_idj(i32 1) %conv2 = trunc i64 %call1 to i32 %vecinit = insertelement <2 x i32> undef, i32 %conv, i32 0 %vecinit3 = insertelement <2 x i32> %vecinit, i32 %conv2, i32 1 - %call4 = tail call spir_func <4 x float> @_Z11read_imagef11ocl_image2d11ocl_samplerDv2_i(%opencl.image2d_t addrspace(1)* %image1, i32 20, <2 x i32> %vecinit3) - tail call spir_func void @_Z12write_imagef11ocl_image2dDv2_iDv4_f(%opencl.image2d_t addrspace(1)* %image2, <2 x i32> %vecinit3, <4 x float> %call4) + %call4 = tail call spir_func <4 x float> @_Z11read_imagef11ocl_image2d11ocl_samplerDv2_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %image1, i32 20, <2 x i32> %vecinit3) + tail call spir_func void @_Z12write_imagef11ocl_image2dDv2_iDv4_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %image2, <2 x i32> %vecinit3, <4 x float> %call4) ret void } declare spir_func i64 @_Z13get_global_idj(i32) -declare spir_func <4 x float> @_Z11read_imagef11ocl_image2d11ocl_samplerDv2_i(%opencl.image2d_t addrspace(1)*, i32, <2 x i32>) +declare spir_func <4 x float> @_Z11read_imagef11ocl_image2d11ocl_samplerDv2_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), i32, <2 x i32>) -declare spir_func void @_Z12write_imagef11ocl_image2dDv2_iDv4_f(%opencl.image2d_t addrspace(1)*, <2 x i32>, <4 x float>) +declare spir_func void @_Z12write_imagef11ocl_image2dDv2_iDv4_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), <2 x i32>, <4 x float>) !1 = !{!"read_only", !"write_only"} diff --git a/llvm/test/CodeGen/SPIRV/image_decl_func_arg.ll b/llvm/test/CodeGen/SPIRV/image_decl_func_arg.ll index b461d09171b4b..a6bab92455b1e 100644 --- a/llvm/test/CodeGen/SPIRV/image_decl_func_arg.ll +++ b/llvm/test/CodeGen/SPIRV/image_decl_func_arg.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: %[[#TypeImage:]] = OpTypeImage ; CHECK-SPIRV-NOT: OpTypeImage @@ -9,20 +9,15 @@ ; CHECK-SPIRV: %[[#ParamID:]] = OpFunctionParameter %[[#TypeImage]] ; CHECK-SPIRV: %[[#]] = OpFunctionCall %[[#]] %[[#]] %[[#ParamID]] -%opencl.image2d_ro_t = type opaque - -define spir_func void @f0(%opencl.image2d_ro_t addrspace(1)* %v2, <2 x float> %v3) { -entry: +define spir_func void @f0(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %v2, <2 x float> %v3) { ret void } -define spir_func void @f1(%opencl.image2d_ro_t addrspace(1)* %v2, <2 x float> %v3) { -entry: +define spir_func void @f1(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %v2, <2 x float> %v3) { ret void } -define spir_kernel void @test(%opencl.image2d_ro_t addrspace(1)* %v1) { -entry: - call spir_func void @f0(%opencl.image2d_ro_t addrspace(1)* %v1, <2 x float> ) +define spir_kernel void @test(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %v1) { + call spir_func void @f0(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %v1, <2 x float> ) ret void } diff --git a/llvm/test/CodeGen/SPIRV/image_dim.ll b/llvm/test/CodeGen/SPIRV/image_dim.ll index 2f5f04f8f9056..7c97e87bd4734 100644 --- a/llvm/test/CodeGen/SPIRV/image_dim.ll +++ b/llvm/test/CodeGen/SPIRV/image_dim.ll @@ -1,12 +1,8 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-DAG: OpCapability Sampled1D ; CHECK-SPIRV-DAG: OpCapability SampledBuffer -%opencl.image1d_t = type 
opaque -%opencl.image1d_buffer_t = type opaque - -define spir_kernel void @image_d(%opencl.image1d_t addrspace(1)* %image1d_td6, %opencl.image1d_buffer_t addrspace(1)* %image1d_buffer_td8) { -entry: +define spir_kernel void @test_image_dim(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %image1d, target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 0) %image1d_buffer) { ret void } diff --git a/llvm/test/CodeGen/SPIRV/image_store.ll b/llvm/test/CodeGen/SPIRV/image_store.ll deleted file mode 100644 index 3542cc2e8e9b3..0000000000000 --- a/llvm/test/CodeGen/SPIRV/image_store.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s - -;; Image types may be represented in two ways while translating to SPIR-V: -;; - OpenCL form based on pointers-to-opaque-structs, e.g. '%opencl.image2d_ro_t', -;; - SPIR-V form based on TargetExtType, e.g. 'target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)', -;; but it is still one type which should be translated to one SPIR-V type. -;; -;; The test checks that the code below is successfully translated and only one -;; SPIR-V type for images is generated. - -; CHECK: OpTypeImage -; CHECK-NOT: OpTypeImage - -%opencl.image2d_ro_t = type opaque - -define spir_kernel void @read_image(%opencl.image2d_ro_t addrspace(1)* %srcimg) { -entry: - %srcimg.addr = alloca %opencl.image2d_ro_t addrspace(1)*, align 8 - %spirvimg.addr = alloca target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), align 8 - store %opencl.image2d_ro_t addrspace(1)* %srcimg, %opencl.image2d_ro_t addrspace(1)** %srcimg.addr, align 8 - ret void -} diff --git a/llvm/test/CodeGen/SPIRV/link-attribute.ll b/llvm/test/CodeGen/SPIRV/link-attribute.ll index c69d64debd7a5..f00c25a949b1e 100644 --- a/llvm/test/CodeGen/SPIRV/link-attribute.ll +++ b/llvm/test/CodeGen/SPIRV/link-attribute.ll @@ -1,33 +1,31 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s - -%opencl.image2d_t = type opaque +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: OpDecorate %[[#ID:]] LinkageAttributes "imageSampler" Export ; CHECK: %[[#ID]] = OpVariable %[[#]] UniformConstant %[[#]] @imageSampler = addrspace(2) constant i32 36, align 4 -define spir_kernel void @sample_kernel(%opencl.image2d_t addrspace(1)* %input, float addrspace(1)* nocapture %xOffsets, float addrspace(1)* nocapture %yOffsets, <4 x float> addrspace(1)* nocapture %results) { +define spir_kernel void @sample_kernel(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, ptr addrspace(1) nocapture %xOffsets, ptr addrspace(1) nocapture %yOffsets, ptr addrspace(1) nocapture %results) { %1 = tail call spir_func i64 @_Z13get_global_idj(i32 0) %2 = trunc i64 %1 to i32 %3 = tail call spir_func i64 @_Z13get_global_idj(i32 1) %4 = trunc i64 %3 to i32 - %5 = tail call spir_func i32 @_Z15get_image_width11ocl_image2d(%opencl.image2d_t addrspace(1)* %input) + %5 = tail call spir_func i32 @_Z15get_image_width11ocl_image2d(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input) %6 = mul nsw i32 %4, %5 %7 = add nsw i32 %6, %2 %8 = sitofp i32 %2 to float %9 = insertelement <2 x float> undef, float %8, i32 0 %10 = sitofp i32 %4 to float %11 = insertelement <2 x float> %9, float %10, i32 1 - %12 = tail call spir_func <4 x float> @_Z11read_imagef11ocl_image2d11ocl_samplerDv2_f(%opencl.image2d_t addrspace(1)* %input, i32 36, <2 x float> %11) + %12 = tail call spir_func <4 x float> @_Z11read_imagef11ocl_image2d11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 
0, 0, 0, 0, 0) %input, i32 36, <2 x float> %11) %13 = sext i32 %7 to i64 - %14 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %results, i64 %13 - store <4 x float> %12, <4 x float> addrspace(1)* %14, align 16 + %14 = getelementptr inbounds <4 x float>, ptr addrspace(1) %results, i64 %13 + store <4 x float> %12, ptr addrspace(1) %14, align 16 ret void } declare spir_func i64 @_Z13get_global_idj(i32) -declare spir_func i32 @_Z15get_image_width11ocl_image2d(%opencl.image2d_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width11ocl_image2d(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) -declare spir_func <4 x float> @_Z11read_imagef11ocl_image2d11ocl_samplerDv2_f(%opencl.image2d_t addrspace(1)*, i32, <2 x float>) +declare spir_func <4 x float> @_Z11read_imagef11ocl_image2d11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), i32, <2 x float>) diff --git a/llvm/test/CodeGen/SPIRV/literal-struct.ll b/llvm/test/CodeGen/SPIRV/literal-struct.ll deleted file mode 100644 index 2e36b7fa58350..0000000000000 --- a/llvm/test/CodeGen/SPIRV/literal-struct.ll +++ /dev/null @@ -1,47 +0,0 @@ -;; This test checks that the backend doesn't crash if the module has literal -;; structs, i.e. structs whose type has no name. Typicaly clang generate such -;; structs if the kernel contains OpenCL 2.0 blocks. The IR was produced with -;; the following command: -;; clang -cc1 -triple spir -cl-std=cl2.0 -O0 literal-struct.cl -emit-llvm -o test/literal-struct.ll - -;; literal-struct.cl: -;; void foo() -;; { -;; void (^myBlock)(void) = ^{}; -;; myBlock(); -;; } - -; RUN: llc -opaque-pointers=0 -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s - -; CHECK: OpName %[[#StructType0:]] "struct.__opencl_block_literal_generic" -; CHECK: %[[#Int8:]] = OpTypeInt 8 0 -; CHECK: %[[#Int8Ptr:]] = OpTypePointer Generic %[[#Int8]] -; CHECK: %[[#Int:]] = OpTypeInt 32 0 -; CHECK: %[[#StructType0:]] = OpTypeStruct %[[#Int]] %[[#Int]] %[[#Int8Ptr]] -; CHECK: %[[#StructType:]] = OpTypeStruct %[[#Int]] %[[#Int]] %[[#Int8Ptr]] - -%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } - -@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__foo_block_invoke to i8*) to i8 addrspace(4)*) }, align 4 -; CHECK: OpConstantComposite %[[#StructType]] - -@__block_literal_global.1 = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } zeroinitializer, align 4 -; CHECK: OpConstantNull %[[#StructType]] - -define spir_func void @foo() { -entry: - %myBlock = alloca %struct.__opencl_block_literal_generic addrspace(4)*, align 4 - store %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), %struct.__opencl_block_literal_generic addrspace(4)** %myBlock, align 4 - call spir_func void @__foo_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) - ret void -} - -define internal spir_func void @__foo_block_invoke(i8 addrspace(4)* %.block_descriptor) { -entry: - %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4 - %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> 
addrspace(4)*, align 4 - store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4 - %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* - store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4 - ret void -} diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll index bab94f7450d1f..3d7d9fbf90759 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll @@ -1,60 +1,69 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV - -; CHECK-SPIRV: OpDecorate %[[#NonConstMemset:]] LinkageAttributes "spirv.llvm_memset_p3i8_i32" -; CHECK-SPIRV: %[[#Int32:]] = OpTypeInt 32 0 -; CHECK-SPIRV: %[[#Int8:]] = OpTypeInt 8 0 -; CHECK-SPIRV: %[[#Int8Ptr:]] = OpTypePointer Generic %[[#Int8]] -; CHECK-SPIRV: %[[#Lenmemset21:]] = OpConstant %[[#]] 4 -; CHECK-SPIRV: %[[#Int8x4:]] = OpTypeArray %[[#Int8]] %[[#Lenmemset21]] -; CHECK-SPIRV: %[[#Int8PtrConst:]] = OpTypePointer UniformConstant %[[#Int8]] -; CHECK-SPIRV: %[[#Lenmemset0:]] = OpConstant %[[#Int32]] 12 -; CHECK-SPIRV: %[[#Int8x12:]] = OpTypeArray %[[#Int8]] %[[#Lenmemset0]] -; CHECK-SPIRV: %[[#Const21:]] = OpConstant %[[#]] 21 -; CHECK-SPIRV: %[[#False:]] = OpConstantFalse %[[#]] -; CHECK-SPIRV: %[[#InitComp:]] = OpConstantComposite %[[#Int8x4]] %[[#Const21]] %[[#Const21]] %[[#Const21]] %[[#Const21]] -; CHECK-SPIRV: %[[#Init:]] = OpConstantNull %[[#Int8x12]] -; CHECK-SPIRV: %[[#ValComp:]] = OpVariable %[[#]] UniformConstant %[[#InitComp]] -; CHECK-SPIRV: %[[#Val:]] = OpVariable %[[#]] UniformConstant %[[#Init]] - -; CHECK-SPIRV: %[[#Target:]] = OpBitcast %[[#Int8Ptr]] %[[#]] -; CHECK-SPIRV: %[[#Source:]] = OpBitcast %[[#Int8PtrConst]] %[[#Val]] -; CHECK-SPIRV: OpCopyMemorySized %[[#Target]] %[[#Source]] %[[#Lenmemset0]] Aligned 4 - -; CHECK-SPIRV: %[[#SourceComp:]] = OpBitcast %[[#Int8PtrConst]] %[[#ValComp]] -; CHECK-SPIRV: OpCopyMemorySized %[[#]] %[[#SourceComp]] %[[#Lenmemset21]] Aligned 4 - -; CHECK-SPIRV: %[[#]] = OpFunctionCall %[[#]] %[[#NonConstMemset]] %[[#]] %[[#]] %[[#]] %[[#False]] - -; CHECK-SPIRV: %[[#NonConstMemset]] = OpFunction %[[#]] -; CHECK-SPIRV: %[[#Dest:]] = OpFunctionParameter %[[#]] -; CHECK-SPIRV: %[[#Value:]] = OpFunctionParameter %[[#]] -; CHECK-SPIRV: %[[#Len:]] = OpFunctionParameter %[[#]] -; CHECK-SPIRV: %[[#Volatile:]] = OpFunctionParameter %[[#]] - -; CHECK-SPIRV: %[[#Entry:]] = OpLabel -; CHECK-SPIRV: %[[#IsZeroLen:]] = OpIEqual %[[#]] %[[#Zero:]] %[[#Len]] -; CHECK-SPIRV: OpBranchConditional %[[#IsZeroLen]] %[[#End:]] %[[#WhileBody:]] - -; CHECK-SPIRV: %[[#WhileBody]] = OpLabel -; CHECK-SPIRV: %[[#Offset:]] = OpPhi %[[#]] %[[#Zero]] %[[#Entry]] %[[#OffsetInc:]] %[[#WhileBody]] -; CHECK-SPIRV: %[[#Ptr:]] = OpInBoundsPtrAccessChain %[[#]] %[[#Dest]] %[[#Offset]] -; CHECK-SPIRV: OpStore %[[#Ptr]] %[[#Value]] Aligned 1 -; CHECK-SPIRV: %[[#OffsetInc]] = OpIAdd %[[#]] %[[#Offset]] %[[#One:]] -; CHECK-SPIRV: %[[#NotEnd:]] = OpULessThan %[[#]] %[[#OffsetInc]] %[[#Len]] -; CHECK-SPIRV: OpBranchConditional %[[#NotEnd]] %[[#WhileBody]] %[[#End]] - -; CHECK-SPIRV: %[[#End]] = OpLabel -; CHECK-SPIRV: OpReturn - -; CHECK-SPIRV: OpFunctionEnd +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s + +; CHECK-DAG: OpDecorate %[[#Memset_p0i32:]] LinkageAttributes "spirv.llvm_memset_p0_i32" Export +; CHECK-DAG: OpDecorate %[[#Memset_p3i32:]] LinkageAttributes "spirv.llvm_memset_p3_i32" Export +; CHECK-DAG: OpDecorate %[[#Memset_p1i64:]] LinkageAttributes "spirv.llvm_memset_p1_i64" Export +; CHECK-DAG: OpDecorate %[[#Memset_p1i64:]] LinkageAttributes "spirv.llvm_memset_p1_i64.volatile" Export + +; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#Int32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#Void:]] = OpTypeVoid +; CHECK-DAG: %[[#Int8Ptr:]] = OpTypePointer Generic %[[#Int8]] + +; CHECK-DAG: %[[#Const4:]] = OpConstant %[[#Int32]] 4 +; CHECK: %[[#Int8x4:]] = OpTypeArray %[[#Int8]] %[[#Const4]] + +; CHECK-DAG: %[[#Const12:]] = OpConstant %[[#Int32]] 12 +; CHECK: %[[#Int8x12:]] = OpTypeArray %[[#Int8]] %[[#Const12]] + +; CHECK-DAG: %[[#Const21:]] = OpConstant %[[#Int8]] 21 +; CHECK-DAG: %[[#False:]] = OpConstantFalse %[[#]] +; CHECK-DAG: %[[#ConstComp:]] = OpConstantComposite %[[#Int8x4]] %[[#Const21]] %[[#Const21]] %[[#Const21]] %[[#Const21]] +; CHECK-DAG: %[[#ConstNull:]] = OpConstantNull %[[#Int8x12]] +; CHECK: %[[#VarComp:]] = OpVariable %[[#]] UniformConstant %[[#ConstComp]] +; CHECK: %[[#VarNull:]] = OpVariable %[[#]] UniformConstant %[[#ConstNull]] + +; CHECK-DAG: %[[#Int8PtrConst:]] = OpTypePointer UniformConstant %[[#Int8]] +; CHECK: %[[#Target:]] = OpBitcast %[[#Int8Ptr]] %[[#]] +; CHECK: %[[#Source:]] = OpBitcast %[[#Int8PtrConst]] %[[#VarNull]] +; CHECK: OpCopyMemorySized %[[#Target]] %[[#Source]] %[[#Const12]] Aligned 4 + +; CHECK: %[[#SourceComp:]] = OpBitcast %[[#Int8PtrConst]] %[[#VarComp]] +; CHECK: OpCopyMemorySized %[[#]] %[[#SourceComp]] %[[#Const4]] Aligned 4 + +; CHECK: %[[#]] = OpFunctionCall %[[#]] %[[#Memset_p0i32]] %[[#]] %[[#]] %[[#]] %[[#False]] + +; CHECK: %[[#Memset_p0i32]] = OpFunction %[[#]] +; CHECK: %[[#Dest:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#Value:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#Len:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#Volatile:]] = OpFunctionParameter %[[#]] + +; CHECK: %[[#Entry:]] = OpLabel +; CHECK: %[[#IsZeroLen:]] = OpIEqual %[[#]] %[[#Zero:]] %[[#Len]] +; CHECK: OpBranchConditional %[[#IsZeroLen]] %[[#End:]] %[[#WhileBody:]] + +; CHECK: %[[#WhileBody]] = OpLabel +; CHECK: %[[#Offset:]] = OpPhi %[[#]] %[[#Zero]] %[[#Entry]] %[[#OffsetInc:]] %[[#WhileBody]] +; CHECK: %[[#Ptr:]] = OpInBoundsPtrAccessChain %[[#]] %[[#Dest]] %[[#Offset]] +; CHECK: OpStore %[[#Ptr]] %[[#Value]] Aligned 1 +; CHECK: %[[#OffsetInc]] = OpIAdd %[[#]] %[[#Offset]] %[[#One:]] +; CHECK: %[[#NotEnd:]] = OpULessThan %[[#]] %[[#OffsetInc]] %[[#Len]] +; CHECK: OpBranchConditional %[[#NotEnd]] %[[#WhileBody]] %[[#End]] + +; CHECK: %[[#End]] = OpLabel +; CHECK: OpReturn + +; CHECK: OpFunctionEnd %struct.S1 = type { i32, i32, i32 } define spir_func void @_Z5foo11v(%struct.S1 addrspace(4)* noalias nocapture sret(%struct.S1 addrspace(4)*) %agg.result, i32 %s1, i64 %s2, i8 %v) { %x = alloca [4 x i8] %x.bc = bitcast [4 x i8]* %x to i8* - %1 = bitcast %struct.S1 addrspace(4)* %agg.result to i8 addrspace(4)* - tail call void @llvm.memset.p4i8.i32(i8 addrspace(4)* align 4 %1, i8 0, i32 12, i1 false) + %a = bitcast %struct.S1 addrspace(4)* %agg.result to i8 addrspace(4)* + tail call void @llvm.memset.p4i8.i32(i8 addrspace(4)* align 4 %a, i8 0, i32 12, i1 false) tail call void @llvm.memset.p0i8.i32(i8* align 4 %x.bc, i8 21, i32 4, i1 false) ;; non-const value @@ -64,13 +73,13 @@ define spir_func void @_Z5foo11v(%struct.S1 addrspace(4)* noalias nocapture sret tail call 
void @llvm.memset.p0i8.i32(i8* align 4 %x.bc, i8 %v, i32 %s1, i1 false) ;; Address spaces, non-const value and size - %a = addrspacecast i8 addrspace(4)* %1 to i8 addrspace(3)* - tail call void @llvm.memset.p3i8.i32(i8 addrspace(3)* align 4 %a, i8 %v, i32 %s1, i1 false) - %b = addrspacecast i8 addrspace(4)* %1 to i8 addrspace(1)* - tail call void @llvm.memset.p1i8.i64(i8 addrspace(1)* align 4 %b, i8 %v, i64 %s2, i1 false) + %b = addrspacecast i8 addrspace(4)* %a to i8 addrspace(3)* + tail call void @llvm.memset.p3i8.i32(i8 addrspace(3)* align 4 %b, i8 %v, i32 %s1, i1 false) + %c = addrspacecast i8 addrspace(4)* %a to i8 addrspace(1)* + tail call void @llvm.memset.p1i8.i64(i8 addrspace(1)* align 4 %c, i8 %v, i64 %s2, i1 false) ;; Volatile - tail call void @llvm.memset.p1i8.i64(i8 addrspace(1)* align 4 %b, i8 %v, i64 %s2, i1 true) + tail call void @llvm.memset.p1i8.i64(i8 addrspace(1)* align 4 %c, i8 %v, i64 %s2, i1 true) ret void } diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/sqrt.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/sqrt.ll index 5e6ad7ffbdbfe..e47b7b84c2b20 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/sqrt.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/sqrt.ll @@ -1,12 +1,12 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s -; CHECK: %[[#ExtInstSetId:]] = OpExtInstImport "OpenCL.std" -; CHECK: %[[#Float:]] = OpTypeFloat 32 -; CHECK: %[[#Double:]] = OpTypeFloat 64 -; CHECK: %[[#Double4:]] = OpTypeVector %[[#Double]] 4 -; CHECK: %[[#FloatArg:]] = OpConstant %[[#Float]] -; CHECK: %[[#DoubleArg:]] = OpConstant %[[#Double]] -; CHECK: %[[#Double4Arg:]] = OpConstantComposite %[[#Double4]] +; CHECK-DAG: %[[#ExtInstSetId:]] = OpExtInstImport "OpenCL.std" +; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#Double:]] = OpTypeFloat 64 +; CHECK-DAG: %[[#Double4:]] = OpTypeVector %[[#Double]] 4 +; CHECK-DAG: %[[#FloatArg:]] = OpConstant %[[#Float]] +; CHECK-DAG: %[[#DoubleArg:]] = OpConstant %[[#Double]] +; CHECK-DAG: %[[#Double4Arg:]] = OpConstantComposite %[[#Double4]] ;; We need to store sqrt results, otherwise isel does not emit sqrts as dead insts. 
define spir_func void @test_sqrt(float* %x, double* %y, <4 x double>* %z) { diff --git a/llvm/test/CodeGen/SPIRV/mangled_function.ll b/llvm/test/CodeGen/SPIRV/mangled_function.ll index 4185d68290ca5..10d713496d1ea 100644 --- a/llvm/test/CodeGen/SPIRV/mangled_function.ll +++ b/llvm/test/CodeGen/SPIRV/mangled_function.ll @@ -14,12 +14,9 @@ ; CHECK-SPIRV: OpName %[[#foo:]] "_Z3foo14ocl_image2d_ro" ; CHECK-SPIRV: %[[#foo]] = OpFunction %[[#]] -%opencl.image2d_ro_t = type opaque - -define spir_func void @bar(%opencl.image2d_ro_t addrspace(1)* %srcImage) local_unnamed_addr { -entry: - tail call spir_func void @_Z3foo14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)* %srcImage) +define spir_func void @bar(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %srcImage) local_unnamed_addr { + tail call spir_func void @_Z3foo14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %srcImage) ret void } -declare spir_func void @_Z3foo14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)*) local_unnamed_addr +declare spir_func void @_Z3foo14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) local_unnamed_addr diff --git a/llvm/test/CodeGen/SPIRV/no_capability_shader.ll b/llvm/test/CodeGen/SPIRV/no_capability_shader.ll index 4121c03e7a830..954b7c83681bd 100644 --- a/llvm/test/CodeGen/SPIRV/no_capability_shader.ll +++ b/llvm/test/CodeGen/SPIRV/no_capability_shader.ll @@ -4,10 +4,6 @@ ; CHECK-SPIRV-NOT: OpCapability Shader -%opencl.image2d_ro_t = type opaque -%opencl.image1d_buffer_ro_t = type opaque - -define spir_kernel void @sample_test(%opencl.image2d_ro_t addrspace(1)* %src, %opencl.image1d_buffer_ro_t addrspace(1)* %buf) { -entry: +define spir_kernel void @sample_test(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src, target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 0) %buf) { ret void } diff --git a/llvm/test/CodeGen/SPIRV/opaque_pointers.ll b/llvm/test/CodeGen/SPIRV/opaque_pointers.ll index 92c29d6bab609..aaddea1372721 100644 --- a/llvm/test/CodeGen/SPIRV/opaque_pointers.ll +++ b/llvm/test/CodeGen/SPIRV/opaque_pointers.ll @@ -1,11 +1,11 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK -; CHECK: %[[#Int8Ty:]] = OpTypeInt 8 0 -; CHECK: %[[#PtrTy:]] = OpTypePointer Function %[[#Int8Ty]] -; CHECK: %[[#Int64Ty:]] = OpTypeInt 64 0 -; CHECK: %[[#FTy:]] = OpTypeFunction %[[#Int64Ty]] %[[#PtrTy]] -; CHECK: %[[#Int32Ty:]] = OpTypeInt 32 0 -; CHECK: %[[#Const:]] = OpConstant %[[#Int32Ty]] 0 +; CHECK-DAG: %[[#Int8Ty:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#PtrTy:]] = OpTypePointer Function %[[#Int8Ty]] +; CHECK-DAG: %[[#Int64Ty:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#FTy:]] = OpTypeFunction %[[#Int64Ty]] %[[#PtrTy]] +; CHECK-DAG: %[[#Int32Ty:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#Const:]] = OpConstant %[[#Int32Ty]] 0 ; CHECK: OpFunction %[[#Int64Ty]] None %[[#FTy]] ; CHECK: %[[#Parm:]] = OpFunctionParameter %[[#PtrTy]] ; CHECK: OpStore %[[#Parm]] %[[#Const]] Aligned 4 diff --git a/llvm/test/CodeGen/SPIRV/opencl.queue_t.ll b/llvm/test/CodeGen/SPIRV/opencl.queue_t.ll deleted file mode 100644 index edf4ac99aff92..0000000000000 --- a/llvm/test/CodeGen/SPIRV/opencl.queue_t.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV - -; CHECK-SPIRV: OpCapability DeviceEnqueue -; CHECK-SPIRV: OpTypeQueue - -%opencl.queue_t = type opaque - -define spir_func void @enqueue_simple_block(%opencl.queue_t* addrspace(3)* nocapture %q) { -entry: - ret void -} diff --git 
a/llvm/test/CodeGen/SPIRV/opencl/basic/get_global_offset.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/get_global_offset.ll index bacc3e7b4997f..da6015d23d054 100644 --- a/llvm/test/CodeGen/SPIRV/opencl/basic/get_global_offset.ll +++ b/llvm/test/CodeGen/SPIRV/opencl/basic/get_global_offset.ll @@ -1,13 +1,13 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: OpEntryPoint Kernel %[[#test_func:]] "test" ; CHECK: OpName %[[#outOffsets:]] "outOffsets" ; CHECK: OpName %[[#test_func]] "test" ; CHECK: OpName %[[#f2_decl:]] "BuiltInGlobalOffset" ; CHECK: OpDecorate %[[#f2_decl]] LinkageAttributes "BuiltInGlobalOffset" Import -; CHECK: %[[#int_ty:]] = OpTypeInt 32 0 -; CHECK: %[[#iptr_ty:]] = OpTypePointer CrossWorkgroup %[[#int_ty]] +; CHECK: %[[#int_ty:]] = OpTypeInt 8 0 ; CHECK: %[[#void_ty:]] = OpTypeVoid +; CHECK: %[[#iptr_ty:]] = OpTypePointer CrossWorkgroup %[[#int_ty]] ; CHECK: %[[#func_ty:]] = OpTypeFunction %[[#void_ty]] %[[#iptr_ty]] ; CHECK: %[[#int64_ty:]] = OpTypeInt 64 0 ; CHECK: %[[#vec_ty:]] = OpTypeVector %[[#int64_ty]] 3 diff --git a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll index 7a293c25d1579..9d759a1cf47d0 100644 --- a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll +++ b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: OpEntryPoint Kernel %[[#f1:]] "writer" ; CHECK: OpEntryPoint Kernel %[[#f2:]] "reader" diff --git a/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll index a8db42bab56a4..2bc4b447c2ed1 100644 --- a/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll +++ b/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ; CHECK: %[[#i16_ty:]] = OpTypeInt 16 0 ; CHECK: %[[#v4xi16_ty:]] = OpTypeVector %[[#i16_ty]] 4 diff --git a/llvm/test/CodeGen/SPIRV/opencl/get_global_id.ll b/llvm/test/CodeGen/SPIRV/opencl/get_global_id.ll index 7512f29233722..d6074191b3acb 100644 --- a/llvm/test/CodeGen/SPIRV/opencl/get_global_id.ll +++ b/llvm/test/CodeGen/SPIRV/opencl/get_global_id.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=spirv64-unknown-unknown -opaque-pointers=0 %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ;; The set of valid inputs for get_global_id depends on the runtime NDRange, ;; but inputs outside of [0, 2] always return 0. diff --git a/llvm/test/CodeGen/SPIRV/opencl/image.ll b/llvm/test/CodeGen/SPIRV/opencl/image.ll index 1dee3aca16536..ea8208c221bbd 100644 --- a/llvm/test/CodeGen/SPIRV/opencl/image.ll +++ b/llvm/test/CodeGen/SPIRV/opencl/image.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ;; FIXME: Write tests to ensure invalid usages of images are rejected, such as: ;; - invalid AS (only global is allowed); @@ -6,28 +6,24 @@ ;; - used with invalid CV-qualifiers (const or volatile in C99).
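;; For example (hypothetical, not present in this test), in OpenCL C terms a
;; declaration along the lines of
;;   kernel void bad(const volatile image2d_t img);
;; or an image object placed in a non-global address space should be diagnosed
;; rather than silently accepted.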
;; FIXME: Write further tests to cover _array, _buffer, _depth, ... types. -%opencl.image1d_ro_t = type opaque ;; read_only image1d_t -%opencl.image2d_wo_t = type opaque ;; write_only image2d_t -%opencl.image3d_rw_t = type opaque ;; read_write image3d_t - define void @foo( - %opencl.image1d_ro_t addrspace(1)* %a, - %opencl.image2d_wo_t addrspace(1)* %b, - %opencl.image3d_rw_t addrspace(1)* %c, + target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %a, + target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %b, + target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 2) %c, i32 addrspace(1)* %d ) { - %pixel = call <4 x i32> @_Z11read_imagei14ocl_image1d_roi(%opencl.image1d_ro_t addrspace(1)* %a, i32 0) - call void @_Z12write_imagei14ocl_image2d_woDv2_iDv4_i(%opencl.image2d_wo_t addrspace(1)* %b, <2 x i32> zeroinitializer, <4 x i32> %pixel) - %size = call i32 @_Z15get_image_width14ocl_image3d_rw(%opencl.image3d_rw_t addrspace(1)* %c) + %pixel = call <4 x i32> @_Z11read_imagei14ocl_image1d_roi(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %a, i32 0) + call void @_Z12write_imagei14ocl_image2d_woDv2_iDv4_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %b, <2 x i32> zeroinitializer, <4 x i32> %pixel) + %size = call i32 @_Z15get_image_width14ocl_image3d_rw(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 2) %c) store i32 %size, i32 addrspace(1)* %d ret void } -declare <4 x i32> @_Z11read_imagei14ocl_image1d_roi(%opencl.image1d_ro_t addrspace(1)*, i32) +declare <4 x i32> @_Z11read_imagei14ocl_image1d_roi(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0), i32) -declare void @_Z12write_imagei14ocl_image2d_woDv2_iDv4_i(%opencl.image2d_wo_t addrspace(1)*, <2 x i32>, <4 x i32>) +declare void @_Z12write_imagei14ocl_image2d_woDv2_iDv4_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, <4 x i32>) -declare i32 @_Z15get_image_width14ocl_image3d_rw(%opencl.image3d_rw_t addrspace(1)*) +declare i32 @_Z15get_image_width14ocl_image3d_rw(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 2)) ;; Capabilities: diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll new file mode 100644 index 0000000000000..062863a0e3adc --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll @@ -0,0 +1,14 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s + +; CHECK: %[[#INT8:]] = OpTypeInt 8 0 +; CHECK: %[[#PTR1:]] = OpTypePointer CrossWorkgroup %[[#INT8]] +; CHECK: %[[#PTR2:]] = OpTypePointer UniformConstant %[[#INT8]] +; CHECK: %[[#]] = OpInBoundsPtrAccessChain %[[#PTR1]] %[[#]] %[[#]] +; CHECK: %[[#]] = OpInBoundsPtrAccessChain %[[#PTR2]] %[[#]] %[[#]] + +define spir_kernel void @foo(ptr addrspace(1) %a, ptr addrspace(2) %b) { +entry: + %c = getelementptr inbounds i8, ptr addrspace(1) %a, i32 1 + %d = getelementptr inbounds i8, ptr addrspace(2) %b, i32 2 + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll new file mode 100644 index 0000000000000..1b4e7a3e733fc --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll @@ -0,0 +1,16 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s + +; CHECK: %[[#INT8:]] = OpTypeInt 8 0 +; CHECK: %[[#PTR1:]] = OpTypePointer CrossWorkgroup %[[#INT8]] +; CHECK: %[[#PTR2:]] = OpTypePointer UniformConstant %[[#INT8]] +; CHECK: %[[#FNP1:]] = OpFunctionParameter %[[#PTR1]] +; CHECK: %[[#FNP2:]] = OpFunctionParameter %[[#PTR2]] +; CHECK: %[[#]] = OpLoad 
%[[#INT8]] %[[#FNP1]] Aligned 1 +; CHECK: %[[#]] = OpLoad %[[#INT8]] %[[#FNP2]] Aligned 1 + +define spir_kernel void @foo(ptr addrspace(1) %a, ptr addrspace(2) %b) { +entry: + %c = load i8, ptr addrspace(1) %a + %d = load i8, ptr addrspace(2) %b + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/read_image.ll b/llvm/test/CodeGen/SPIRV/read_image.ll index d6a3b482184df..ede5994279d2f 100644 --- a/llvm/test/CodeGen/SPIRV/read_image.ll +++ b/llvm/test/CodeGen/SPIRV/read_image.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: %[[#IntTy:]] = OpTypeInt ; CHECK-SPIRV: %[[#IVecTy:]] = OpTypeVector %[[#IntTy]] @@ -15,36 +15,34 @@ ;; float4 f = read_imagef(input, (int4)(0, 0, 0, 0)); ;; } -%opencl.image3d_ro_t = type opaque - -define dso_local spir_kernel void @kernelA(%opencl.image3d_ro_t addrspace(1)* %input) { +define dso_local spir_kernel void @kernelA(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %input) { entry: - %input.addr = alloca %opencl.image3d_ro_t addrspace(1)*, align 8 + %input.addr = alloca target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), align 8 %c = alloca <4 x i32>, align 16 %.compoundliteral = alloca <4 x i32>, align 16 - store %opencl.image3d_ro_t addrspace(1)* %input, %opencl.image3d_ro_t addrspace(1)** %input.addr, align 8 - %0 = load %opencl.image3d_ro_t addrspace(1)*, %opencl.image3d_ro_t addrspace(1)** %input.addr, align 8 - store <4 x i32> zeroinitializer, <4 x i32>* %.compoundliteral, align 16 - %1 = load <4 x i32>, <4 x i32>* %.compoundliteral, align 16 - %call = call spir_func <4 x i32> @_Z12read_imageui14ocl_image3d_roDv4_i(%opencl.image3d_ro_t addrspace(1)* %0, <4 x i32> noundef %1) - store <4 x i32> %call, <4 x i32>* %c, align 16 + store target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %input, ptr %input.addr, align 8 + %0 = load target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), ptr %input.addr, align 8 + store <4 x i32> zeroinitializer, ptr %.compoundliteral, align 16 + %1 = load <4 x i32>, ptr %.compoundliteral, align 16 + %call = call spir_func <4 x i32> @_Z12read_imageui14ocl_image3d_roDv4_i(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %0, <4 x i32> noundef %1) + store <4 x i32> %call, ptr %c, align 16 ret void } -declare spir_func <4 x i32> @_Z12read_imageui14ocl_image3d_roDv4_i(%opencl.image3d_ro_t addrspace(1)*, <4 x i32> noundef) +declare spir_func <4 x i32> @_Z12read_imageui14ocl_image3d_roDv4_i(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %0, <4 x i32> noundef %1) -define dso_local spir_kernel void @kernelB(%opencl.image3d_ro_t addrspace(1)* %input) { +define dso_local spir_kernel void @kernelB(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %input) { entry: - %input.addr = alloca %opencl.image3d_ro_t addrspace(1)*, align 8 + %input.addr = alloca target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), align 8 %f = alloca <4 x float>, align 16 %.compoundliteral = alloca <4 x i32>, align 16 - store %opencl.image3d_ro_t addrspace(1)* %input, %opencl.image3d_ro_t addrspace(1)** %input.addr, align 8 - %0 = load %opencl.image3d_ro_t addrspace(1)*, %opencl.image3d_ro_t addrspace(1)** %input.addr, align 8 - store <4 x i32> zeroinitializer, <4 x i32>* %.compoundliteral, align 16 - %1 = load <4 x i32>, <4 x i32>* %.compoundliteral, align 16 - %call = call spir_func <4 x float> @_Z11read_imagef14ocl_image3d_roDv4_i(%opencl.image3d_ro_t addrspace(1)* %0, <4 x i32> 
noundef %1) - store <4 x float> %call, <4 x float>* %f, align 16 + store target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %input, ptr %input.addr, align 8 + %0 = load target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), ptr %input.addr, align 8 + store <4 x i32> zeroinitializer, ptr %.compoundliteral, align 16 + %1 = load <4 x i32>, ptr %.compoundliteral, align 16 + %call = call spir_func <4 x float> @_Z11read_imagef14ocl_image3d_roDv4_i(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %0, <4 x i32> noundef %1) + store <4 x float> %call, ptr %f, align 16 ret void } -declare spir_func <4 x float> @_Z11read_imagef14ocl_image3d_roDv4_i(%opencl.image3d_ro_t addrspace(1)*, <4 x i32> noundef) +declare spir_func <4 x float> @_Z11read_imagef14ocl_image3d_roDv4_i(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %0, <4 x i32> noundef %1) diff --git a/llvm/test/CodeGen/SPIRV/struct.ll b/llvm/test/CodeGen/SPIRV/struct.ll index 15d251cd17f35..3c4fc81b92004 100644 --- a/llvm/test/CodeGen/SPIRV/struct.ll +++ b/llvm/test/CodeGen/SPIRV/struct.ll @@ -1,16 +1,17 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s %struct.ST = type { i32, i32, i32 } -; CHECK-SPIRV: OpName %[[#struct:]] "struct.ST" -; CHECK-SPIRV: %[[#int:]] = OpTypeInt 32 0 -; CHECK-SPIRV-DAG: %[[#struct]] = OpTypeStruct %[[#int]] %[[#int]] %[[#int]] -; CHECK-SPIRV-DAG: %[[#structP:]] = OpTypePointer Function %[[#struct]] -; CHECK-SPIRV-DAG: %[[#intP:]] = OpTypePointer Function %[[#int]] -; CHECK-SPIRV: %[[#zero:]] = OpConstant %[[#int]] 0 -; CHECK-SPIRV: %[[#one:]] = OpConstant %[[#int]] 1 -; CHECK-SPIRV: %[[#two:]] = OpConstant %[[#int]] 2 -; CHECK-SPIRV: %[[#three:]] = OpConstant %[[#int]] 3 +; CHECK-DAG: OpName %[[#struct:]] "struct.ST" +; CHECK-DAG: %[[#char:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#int:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#struct]] = OpTypeStruct %[[#int]] %[[#int]] %[[#int]] +; CHECK-DAG: %[[#structP:]] = OpTypePointer Function %[[#struct]] +; CHECK-DAG: %[[#intP:]] = OpTypePointer Function %[[#char]] +; CHECK-DAG: %[[#zero:]] = OpConstant %[[#int]] 0 +; CHECK-DAG: %[[#one:]] = OpConstant %[[#int]] 1 +; CHECK-DAG: %[[#two:]] = OpConstant %[[#int]] 2 +; CHECK-DAG: %[[#three:]] = OpConstant %[[#int]] 3 define dso_local spir_func i32 @func() { entry: diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll index f5236b365eb91..79d360324110c 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll @@ -18,7 +18,7 @@ ;; } ;; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -cl-std=CL2.0 -triple spir64-unknown-unknown -emit-llvm -include opencl-20.h BuildNDRange_2.cl -o BuildNDRange_2.ll -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-DAG: %[[#LEN2_ID:]] = OpConstant %[[#]] 2 ; CHECK-SPIRV-DAG: %[[#LEN3_ID:]] = OpConstant %[[#]] 3 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpConstantSampler.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpConstantSampler.ll index e5b721e80e585..7e0d13490fcf6 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpConstantSampler.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpConstantSampler.ll @@ -10,26 +10,23 @@ ;; read_imagef(src, sampler2, 0, 0); ;; } -; RUN: 
llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: %[[#SamplerID0:]] = OpConstantSampler %[[#]] Repeat 1 Nearest ; CHECK-SPIRV: %[[#SamplerID1:]] = OpConstantSampler %[[#]] None 0 Nearest ; CHECK-SPIRV: %[[#]] = OpSampledImage %[[#]] %[[#]] %[[#SamplerID0]] ; CHECK-SPIRV: %[[#]] = OpSampledImage %[[#]] %[[#]] %[[#SamplerID1]] -%opencl.image2d_ro_t = type opaque -%opencl.sampler_t = type opaque - -define spir_func <4 x float> @foo(%opencl.image2d_ro_t addrspace(1)* %src) local_unnamed_addr { +define spir_func <4 x float> @foo(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src) local_unnamed_addr { entry: - %0 = tail call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 23) - %1 = tail call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 0) - %call = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(%opencl.image2d_ro_t addrspace(1)* %src, %opencl.sampler_t addrspace(2)* %0, <2 x float> zeroinitializer, float 0.000000e+00) - %call1 = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(%opencl.image2d_ro_t addrspace(1)* %src, %opencl.sampler_t addrspace(2)* %1, <2 x float> zeroinitializer, float 0.000000e+00) + %0 = tail call target("spirv.Sampler") @__translate_sampler_initializer(i32 23) + %1 = tail call target("spirv.Sampler") @__translate_sampler_initializer(i32 0) + %call = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src, target("spirv.Sampler") %0, <2 x float> zeroinitializer, float 0.000000e+00) + %call1 = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %src, target("spirv.Sampler") %1, <2 x float> zeroinitializer, float 0.000000e+00) %add = fadd <4 x float> %call, %call1 ret <4 x float> %add } -declare %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32) local_unnamed_addr +declare target("spirv.Sampler") @__translate_sampler_initializer(i32) local_unnamed_addr -declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(%opencl.image2d_ro_t addrspace(1)*, %opencl.sampler_t addrspace(2)*, <2 x float>, float) local_unnamed_addr +declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), target("spirv.Sampler"), <2 x float>, float) local_unnamed_addr diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpImageQuerySize.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpImageQuerySize.ll index 3808eec72877d..0c1f8eaa34a40 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpImageQuerySize.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpImageQuerySize.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ;; Check conversion of get_image_width, get_image_height, get_image_depth, ;; get_image_array_size, and get_image_dim OCL built-ins. 
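;; A minimal OpenCL C sketch of the kind of code being checked (hypothetical;
;; the original test does not embed its source, and the names are illustrative):
;; kernel void query_sizes(read_only image2d_t img, global int *out) {
;;   int2 dim = get_image_dim(img);
;;   *out = get_image_width(img) + get_image_height(img) + dim.x;
;; }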
@@ -8,24 +8,15 @@ ; CHECK-SPIRV: %[[#ArrayTypeID:]] = OpTypeImage %[[#]] 1D 0 1 0 0 Unknown ReadOnly -%opencl.image1d_ro_t = type opaque -%opencl.image1d_buffer_ro_t = type opaque -%opencl.image1d_array_ro_t = type opaque -%opencl.image2d_ro_t = type opaque -%opencl.image2d_depth_ro_t = type opaque -%opencl.image2d_array_ro_t = type opaque -%opencl.image2d_array_depth_ro_t = type opaque -%opencl.image3d_ro_t = type opaque - ; CHECK-SPIRV: %[[#ArrayVarID:]] = OpFunctionParameter %[[#ArrayTypeID]] ; CHECK-SPIRV: %[[#]] = OpImageQuerySizeLod %[[#]] %[[#ArrayVarID]] ; CHECK-SPIRV-NOT: %[[#]] = OpExtInst %[[#]] %[[#]] get_image_array_size -define spir_kernel void @test_image1d(i32 addrspace(1)* nocapture %sizes, %opencl.image1d_ro_t addrspace(1)* %img, %opencl.image1d_buffer_ro_t addrspace(1)* %buffer, %opencl.image1d_array_ro_t addrspace(1)* %array) { - %1 = tail call spir_func i32 @_Z15get_image_width14ocl_image1d_ro(%opencl.image1d_ro_t addrspace(1)* %img) - %2 = tail call spir_func i32 @_Z15get_image_width21ocl_image1d_buffer_ro(%opencl.image1d_buffer_ro_t addrspace(1)* %buffer) - %3 = tail call spir_func i32 @_Z15get_image_width20ocl_image1d_array_ro(%opencl.image1d_array_ro_t addrspace(1)* %array) - %4 = tail call spir_func i64 @_Z20get_image_array_size20ocl_image1d_array_ro(%opencl.image1d_array_ro_t addrspace(1)* %array) +define spir_kernel void @test_image1d(i32 addrspace(1)* nocapture %sizes, target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %img, target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 0) %buffer, target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0) %array) { + %1 = tail call spir_func i32 @_Z15get_image_width14ocl_image1d_ro(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %img) + %2 = tail call spir_func i32 @_Z15get_image_width21ocl_image1d_buffer_ro(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 0) %buffer) + %3 = tail call spir_func i32 @_Z15get_image_width20ocl_image1d_array_ro(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0) %array) + %4 = tail call spir_func i64 @_Z20get_image_array_size20ocl_image1d_array_ro(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0) %array) %5 = trunc i64 %4 to i32 %6 = add nsw i32 %2, %1 %7 = add nsw i32 %6, %3 @@ -34,23 +25,23 @@ define spir_kernel void @test_image1d(i32 addrspace(1)* nocapture %sizes, %openc ret void } -declare spir_func i32 @_Z15get_image_width14ocl_image1d_ro(%opencl.image1d_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width14ocl_image1d_ro(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z15get_image_width21ocl_image1d_buffer_ro(%opencl.image1d_buffer_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width21ocl_image1d_buffer_ro(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z15get_image_width20ocl_image1d_array_ro(%opencl.image1d_array_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width20ocl_image1d_array_ro(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0)) -declare spir_func i64 @_Z20get_image_array_size20ocl_image1d_array_ro(%opencl.image1d_array_ro_t addrspace(1)*) +declare spir_func i64 @_Z20get_image_array_size20ocl_image1d_array_ro(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0)) -define spir_kernel void @test_image2d(i32 addrspace(1)* nocapture %sizes, %opencl.image2d_ro_t addrspace(1)* %img, %opencl.image2d_depth_ro_t addrspace(1)* nocapture %img_depth, %opencl.image2d_array_ro_t addrspace(1)* %array, %opencl.image2d_array_depth_ro_t addrspace(1)* nocapture %array_depth) { - %1 = tail call spir_func i32 
@_Z15get_image_width14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)* %img) - %2 = tail call spir_func i32 @_Z16get_image_height14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)* %img) - %3 = tail call spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)* %img) - %4 = tail call spir_func i32 @_Z15get_image_width20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)* %array) - %5 = tail call spir_func i32 @_Z16get_image_height20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)* %array) - %6 = tail call spir_func i64 @_Z20get_image_array_size20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)* %array) +define spir_kernel void @test_image2d(i32 addrspace(1)* nocapture %sizes, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %img, target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0) %img_depth, target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %array, target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0) %array_depth) { + %1 = tail call spir_func i32 @_Z15get_image_width14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %img) + %2 = tail call spir_func i32 @_Z16get_image_height14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %img) + %3 = tail call spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %img) + %4 = tail call spir_func i32 @_Z15get_image_width20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %array) + %5 = tail call spir_func i32 @_Z16get_image_height20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %array) + %6 = tail call spir_func i64 @_Z20get_image_array_size20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %array) %7 = trunc i64 %6 to i32 - %8 = tail call spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)* %array) + %8 = tail call spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %array) %9 = add nsw i32 %2, %1 %10 = extractelement <2 x i32> %3, i32 0 %11 = add nsw i32 %9, %10 @@ -67,25 +58,25 @@ define spir_kernel void @test_image2d(i32 addrspace(1)* nocapture %sizes, %openc ret void } -declare spir_func i32 @_Z15get_image_width14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z16get_image_height14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)*) +declare spir_func i32 @_Z16get_image_height14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) -declare spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)*) +declare spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z15get_image_width20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0)) -declare spir_func i32 @_Z16get_image_height20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)*) +declare spir_func i32 @_Z16get_image_height20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0)) -declare spir_func i64 @_Z20get_image_array_size20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)*) +declare spir_func i64 @_Z20get_image_array_size20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 
0, 0, 0)) -declare spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)*) +declare spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0)) -define spir_kernel void @test_image3d(i32 addrspace(1)* nocapture %sizes, %opencl.image3d_ro_t addrspace(1)* %img) { - %1 = tail call spir_func i32 @_Z15get_image_width14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)* %img) - %2 = tail call spir_func i32 @_Z16get_image_height14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)* %img) - %3 = tail call spir_func i32 @_Z15get_image_depth14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)* %img) - %4 = tail call spir_func <4 x i32> @_Z13get_image_dim14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)* %img) +define spir_kernel void @test_image3d(i32 addrspace(1)* nocapture %sizes, target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %img) { + %1 = tail call spir_func i32 @_Z15get_image_width14ocl_image3d_ro(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %img) + %2 = tail call spir_func i32 @_Z16get_image_height14ocl_image3d_ro(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %img) + %3 = tail call spir_func i32 @_Z15get_image_depth14ocl_image3d_ro(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %img) + %4 = tail call spir_func <4 x i32> @_Z13get_image_dim14ocl_image3d_ro(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %img) %5 = add nsw i32 %2, %1 %6 = add nsw i32 %5, %3 %7 = extractelement <4 x i32> %4, i32 0 @@ -100,18 +91,18 @@ define spir_kernel void @test_image3d(i32 addrspace(1)* nocapture %sizes, %openc ret void } -declare spir_func i32 @_Z15get_image_width14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width14ocl_image3d_ro(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z16get_image_height14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)*) +declare spir_func i32 @_Z16get_image_height14ocl_image3d_ro(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z15get_image_depth14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_depth14ocl_image3d_ro(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0)) -declare spir_func <4 x i32> @_Z13get_image_dim14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)*) +declare spir_func <4 x i32> @_Z13get_image_dim14ocl_image3d_ro(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0)) -define spir_kernel void @test_image2d_array_depth_t(i32 addrspace(1)* nocapture %sizes, %opencl.image2d_array_depth_ro_t addrspace(1)* %array) { - %1 = tail call spir_func i32 @_Z15get_image_width26ocl_image2d_array_depth_ro(%opencl.image2d_array_depth_ro_t addrspace(1)* %array) - %2 = tail call spir_func i32 @_Z16get_image_height26ocl_image2d_array_depth_ro(%opencl.image2d_array_depth_ro_t addrspace(1)* %array) - %3 = tail call spir_func i64 @_Z20get_image_array_size26ocl_image2d_array_depth_ro(%opencl.image2d_array_depth_ro_t addrspace(1)* %array) +define spir_kernel void @test_image2d_array_depth_t(i32 addrspace(1)* nocapture %sizes, target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0) %array) { + %1 = tail call spir_func i32 @_Z15get_image_width26ocl_image2d_array_depth_ro(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0) %array) + %2 = tail call spir_func i32 @_Z16get_image_height26ocl_image2d_array_depth_ro(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0) %array) + %3 = tail call spir_func i64 @_Z20get_image_array_size26ocl_image2d_array_depth_ro(target("spirv.Image", 
void, 1, 1, 1, 0, 0, 0, 0) %array) %4 = trunc i64 %3 to i32 %5 = add nsw i32 %2, %1 %6 = add nsw i32 %5, %4 @@ -119,8 +110,8 @@ define spir_kernel void @test_image2d_array_depth_t(i32 addrspace(1)* nocapture ret void } -declare spir_func i32 @_Z15get_image_width26ocl_image2d_array_depth_ro(%opencl.image2d_array_depth_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width26ocl_image2d_array_depth_ro(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0)) -declare spir_func i32 @_Z16get_image_height26ocl_image2d_array_depth_ro(%opencl.image2d_array_depth_ro_t addrspace(1)*) +declare spir_func i32 @_Z16get_image_height26ocl_image2d_array_depth_ro(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0)) -declare spir_func i64 @_Z20get_image_array_size26ocl_image2d_array_depth_ro(%opencl.image2d_array_depth_ro_t addrspace(1)*) +declare spir_func i64 @_Z20get_image_array_size26ocl_image2d_array_depth_ro(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0)) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpImageReadMS.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpImageReadMS.ll index 7c1a41a8e51a7..f4e34c325e601 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpImageReadMS.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpImageReadMS.ll @@ -1,16 +1,14 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: %[[#]] = OpImageRead %[[#]] %[[#]] %[[#]] Sample %[[#]] -%opencl.image2d_msaa_ro_t = type opaque - -define spir_kernel void @sample_test(%opencl.image2d_msaa_ro_t addrspace(1)* %source, i32 %sampler, <4 x float> addrspace(1)* nocapture %results) { +define spir_kernel void @sample_test(target("spirv.Image", void, 1, 0, 0, 1, 0, 0, 0) %source, i32 %sampler, <4 x float> addrspace(1)* nocapture %results) { entry: %call = tail call spir_func i32 @_Z13get_global_idj(i32 0) %call1 = tail call spir_func i32 @_Z13get_global_idj(i32 1) - %call2 = tail call spir_func i32 @_Z15get_image_width19ocl_image2d_msaa_ro(%opencl.image2d_msaa_ro_t addrspace(1)* %source) - %call3 = tail call spir_func i32 @_Z16get_image_height19ocl_image2d_msaa_ro(%opencl.image2d_msaa_ro_t addrspace(1)* %source) - %call4 = tail call spir_func i32 @_Z21get_image_num_samples19ocl_image2d_msaa_ro(%opencl.image2d_msaa_ro_t addrspace(1)* %source) + %call2 = tail call spir_func i32 @_Z15get_image_width19ocl_image2d_msaa_ro(target("spirv.Image", void, 1, 0, 0, 1, 0, 0, 0) %source) + %call3 = tail call spir_func i32 @_Z16get_image_height19ocl_image2d_msaa_ro(target("spirv.Image", void, 1, 0, 0, 1, 0, 0, 0) %source) + %call4 = tail call spir_func i32 @_Z21get_image_num_samples19ocl_image2d_msaa_ro(target("spirv.Image", void, 1, 0, 0, 1, 0, 0, 0) %source) %cmp20 = icmp eq i32 %call4, 0 br i1 %cmp20, label %for.end, label %for.body.lr.ph @@ -25,7 +23,7 @@ for.body: ; preds = %for.body.lr.ph, %fo %tmp = add i32 %mul5, %call1 %tmp19 = mul i32 %tmp, %call2 %add7 = add i32 %tmp19, %call - %call9 = tail call spir_func <4 x float> @_Z11read_imagef19ocl_image2d_msaa_roDv2_ii(%opencl.image2d_msaa_ro_t addrspace(1)* %source, <2 x i32> %vecinit8, i32 %sample.021) + %call9 = tail call spir_func <4 x float> @_Z11read_imagef19ocl_image2d_msaa_roDv2_ii(target("spirv.Image", void, 1, 0, 0, 1, 0, 0, 0) %source, <2 x i32> %vecinit8, i32 %sample.021) %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %results, i32 %add7 store <4 x float> %call9, <4 x float> addrspace(1)* 
%arrayidx, align 16 %inc = add nuw i32 %sample.021, 1 @@ -41,10 +39,10 @@ for.end: ; preds = %for.end.loopexit, % declare spir_func i32 @_Z13get_global_idj(i32) -declare spir_func i32 @_Z15get_image_width19ocl_image2d_msaa_ro(%opencl.image2d_msaa_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width19ocl_image2d_msaa_ro(target("spirv.Image", void, 1, 0, 0, 1, 0, 0, 0)) -declare spir_func i32 @_Z16get_image_height19ocl_image2d_msaa_ro(%opencl.image2d_msaa_ro_t addrspace(1)*) +declare spir_func i32 @_Z16get_image_height19ocl_image2d_msaa_ro(target("spirv.Image", void, 1, 0, 0, 1, 0, 0, 0)) -declare spir_func i32 @_Z21get_image_num_samples19ocl_image2d_msaa_ro(%opencl.image2d_msaa_ro_t addrspace(1)*) +declare spir_func i32 @_Z21get_image_num_samples19ocl_image2d_msaa_ro(target("spirv.Image", void, 1, 0, 0, 1, 0, 0, 0)) -declare spir_func <4 x float> @_Z11read_imagef19ocl_image2d_msaa_roDv2_ii(%opencl.image2d_msaa_ro_t addrspace(1)*, <2 x i32>, i32) +declare spir_func <4 x float> @_Z11read_imagef19ocl_image2d_msaa_roDv2_ii(target("spirv.Image", void, 1, 0, 0, 1, 0, 0, 0), <2 x i32>, i32) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpImageSampleExplicitLod.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpImageSampleExplicitLod.ll index 8f2753d71c18f..989a724469c39 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpImageSampleExplicitLod.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpImageSampleExplicitLod.ll @@ -1,16 +1,14 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-DAG: %[[#RetID:]] = OpImageSampleExplicitLod %[[#RetType:]] %[[#]] %[[#]] Lod %[[#]] ; CHECK-SPIRV-DAG: %[[#RetType]] = OpTypeVector %[[#]] 4 ; CHECK-SPIRV: %[[#]] = OpCompositeExtract %[[#]] %[[#RetID]] 0 -%opencl.image2d_depth_ro_t = type opaque - -define spir_kernel void @sample_kernel(%opencl.image2d_depth_ro_t addrspace(1)* %input, i32 %imageSampler, float addrspace(1)* %xOffsets, float addrspace(1)* %yOffsets, float addrspace(1)* %results) { +define spir_kernel void @sample_kernel(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0) %input, i32 %imageSampler, float addrspace(1)* %xOffsets, float addrspace(1)* %yOffsets, float addrspace(1)* %results) { entry: %call = call spir_func i32 @_Z13get_global_idj(i32 0) %call1 = call spir_func i32 @_Z13get_global_idj(i32 1) - %call2.tmp1 = call spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_depth_ro(%opencl.image2d_depth_ro_t addrspace(1)* %input) + %call2.tmp1 = call spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_depth_ro(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0) %input) %call2.old = extractelement <2 x i32> %call2.tmp1, i32 0 %mul = mul i32 %call1, %call2.old %add = add i32 %mul, %call @@ -22,14 +20,14 @@ entry: %1 = load float, float addrspace(1)* %arrayidx3, align 4 %conv4 = fptosi float %1 to i32 %vecinit5 = insertelement <2 x i32> %vecinit, i32 %conv4, i32 1 - %call6.tmp.tmp = call spir_func float @_Z11read_imagef20ocl_image2d_depth_ro11ocl_samplerDv2_i(%opencl.image2d_depth_ro_t addrspace(1)* %input, i32 %imageSampler, <2 x i32> %vecinit5) + %call6.tmp.tmp = call spir_func float @_Z11read_imagef20ocl_image2d_depth_ro11ocl_samplerDv2_i(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0) %input, i32 %imageSampler, <2 x i32> %vecinit5) %arrayidx7 = getelementptr inbounds float, float addrspace(1)* %results, i32 %add store float %call6.tmp.tmp, float addrspace(1)* %arrayidx7, 
align 4 ret void } -declare spir_func float @_Z11read_imagef20ocl_image2d_depth_ro11ocl_samplerDv2_i(%opencl.image2d_depth_ro_t addrspace(1)*, i32, <2 x i32>) +declare spir_func float @_Z11read_imagef20ocl_image2d_depth_ro11ocl_samplerDv2_i(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0), i32, <2 x i32>) declare spir_func i32 @_Z13get_global_idj(i32) -declare spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_depth_ro(%opencl.image2d_depth_ro_t addrspace(1)*) +declare spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_depth_ro(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0)) \ No newline at end of file diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpImageWrite.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpImageWrite.ll index ef44126d800cf..30ef614bfa46d 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpImageWrite.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpImageWrite.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: %[[#VOID_TY:]] = OpTypeVoid ; CHECK-SPIRV: %[[#IMG2D_WO_TY:]] = OpTypeImage %[[#VOID_TY]] 2D 0 0 0 0 Unknown WriteOnly @@ -16,21 +16,6 @@ ; CHECK-SPIRV: %[[#IMG3D_WO_TY:]] = OpTypeImage %[[#VOID_TY]] 3D 0 0 0 0 Unknown WriteOnly ; CHECK-SPIRV: %[[#IMG3D_RW_TY:]] = OpTypeImage %[[#VOID_TY]] 3D 0 0 0 0 Unknown ReadWrite -%opencl.image2d_wo_t = type opaque -%opencl.image2d_rw_t = type opaque -%opencl.image2d_array_wo_t = type opaque -%opencl.image2d_array_rw_t = type opaque -%opencl.image1d_wo_t = type opaque -%opencl.image1d_rw_t = type opaque -%opencl.image1d_buffer_wo_t = type opaque -%opencl.image1d_buffer_rw_t = type opaque -%opencl.image1d_array_wo_t = type opaque -%opencl.image1d_array_rw_t = type opaque -%opencl.image2d_depth_wo_t = type opaque -%opencl.image2d_array_depth_wo_t = type opaque -%opencl.image3d_wo_t = type opaque -%opencl.image3d_rw_t = type opaque - ;; kernel void test_img2d(write_only image2d_t image_wo, read_write image2d_t image_rw) ;; { ;; write_imagef(image_wo, (int2)(0,0), (float4)(0,0,0,0)); @@ -53,28 +38,28 @@ ; CHECK-SPIRV: OpImageWrite %[[#IMG2D_WO]] ; CHECK-SPIRV: OpImageWrite %[[#IMG2D_WO]] -define dso_local spir_kernel void @test_img2d(%opencl.image2d_wo_t addrspace(1)* %image_wo, %opencl.image2d_rw_t addrspace(1)* %image_rw) local_unnamed_addr { +define dso_local spir_kernel void @test_img2d(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %image_wo, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 2) %image_rw) local_unnamed_addr { entry: - call spir_func void @_Z12write_imagef14ocl_image2d_woDv2_iDv4_f(%opencl.image2d_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei14ocl_image2d_woDv2_iDv4_i(%opencl.image2d_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef14ocl_image2d_rwDv2_iDv4_f(%opencl.image2d_rw_t addrspace(1)* %image_rw, <2 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei14ocl_image2d_rwDv2_iDv4_i(%opencl.image2d_rw_t addrspace(1)* %image_rw, <2 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef14ocl_image2d_woDv2_iiDv4_f(%opencl.image2d_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, <4 x float> noundef 
zeroinitializer) - call spir_func void @_Z12write_imagei14ocl_image2d_woDv2_iiDv4_i(%opencl.image2d_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef14ocl_image2d_woDv2_iDv4_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei14ocl_image2d_woDv2_iDv4_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef14ocl_image2d_rwDv2_iDv4_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 2) %image_rw, <2 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei14ocl_image2d_rwDv2_iDv4_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 2) %image_rw, <2 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef14ocl_image2d_woDv2_iiDv4_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei14ocl_image2d_woDv2_iiDv4_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, <4 x i32> noundef zeroinitializer) ret void } -declare spir_func void @_Z12write_imagef14ocl_image2d_woDv2_iDv4_f(%opencl.image2d_wo_t addrspace(1)*, <2 x i32> noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef14ocl_image2d_woDv2_iDv4_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32> noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei14ocl_image2d_woDv2_iDv4_i(%opencl.image2d_wo_t addrspace(1)*, <2 x i32> noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei14ocl_image2d_woDv2_iDv4_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32> noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef14ocl_image2d_rwDv2_iDv4_f(%opencl.image2d_rw_t addrspace(1)*, <2 x i32> noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef14ocl_image2d_rwDv2_iDv4_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 2), <2 x i32> noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei14ocl_image2d_rwDv2_iDv4_i(%opencl.image2d_rw_t addrspace(1)*, <2 x i32> noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei14ocl_image2d_rwDv2_iDv4_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 2), <2 x i32> noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef14ocl_image2d_woDv2_iiDv4_f(%opencl.image2d_wo_t addrspace(1)*, <2 x i32> noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef14ocl_image2d_woDv2_iiDv4_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32> noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei14ocl_image2d_woDv2_iiDv4_i(%opencl.image2d_wo_t addrspace(1)*, <2 x i32> noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei14ocl_image2d_woDv2_iiDv4_i(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32> noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr ;; kernel void 
test_img2d_array(write_only image2d_array_t image_wo, read_write image2d_array_t image_rw) ;; { @@ -98,28 +83,28 @@ declare spir_func void @_Z12write_imagei14ocl_image2d_woDv2_iiDv4_i(%opencl.imag ; CHECK-SPIRV: OpImageWrite %[[#IMG2D_ARRAY_WO]] ; CHECK-SPIRV: OpImageWrite %[[#IMG2D_ARRAY_WO]] -define dso_local spir_kernel void @test_img2d_array(%opencl.image2d_array_wo_t addrspace(1)* %image_wo, %opencl.image2d_array_rw_t addrspace(1)* %image_rw) local_unnamed_addr { +define dso_local spir_kernel void @test_img2d_array(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 1) %image_wo, target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 2) %image_rw) local_unnamed_addr { entry: - call spir_func void @_Z12write_imagef20ocl_image2d_array_woDv4_iDv4_f(%opencl.image2d_array_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei20ocl_image2d_array_woDv4_iS0_(%opencl.image2d_array_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef20ocl_image2d_array_rwDv4_iDv4_f(%opencl.image2d_array_rw_t addrspace(1)* %image_rw, <4 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei20ocl_image2d_array_rwDv4_iS0_(%opencl.image2d_array_rw_t addrspace(1)* %image_rw, <4 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef20ocl_image2d_array_woDv4_iiDv4_f(%opencl.image2d_array_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei20ocl_image2d_array_woDv4_iiS0_(%opencl.image2d_array_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef20ocl_image2d_array_woDv4_iDv4_f(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei20ocl_image2d_array_woDv4_iS0_(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef20ocl_image2d_array_rwDv4_iDv4_f(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 2) %image_rw, <4 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei20ocl_image2d_array_rwDv4_iS0_(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 2) %image_rw, <4 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef20ocl_image2d_array_woDv4_iiDv4_f(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei20ocl_image2d_array_woDv4_iiS0_(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, <4 x i32> noundef zeroinitializer) ret void } -declare spir_func void @_Z12write_imagef20ocl_image2d_array_woDv4_iDv4_f(%opencl.image2d_array_wo_t addrspace(1)*, <4 x i32> noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef20ocl_image2d_array_woDv4_iDv4_f(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 1), <4 x i32> noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void 
@_Z12write_imagei20ocl_image2d_array_woDv4_iS0_(%opencl.image2d_array_wo_t addrspace(1)*, <4 x i32> noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei20ocl_image2d_array_woDv4_iS0_(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 1), <4 x i32> noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef20ocl_image2d_array_rwDv4_iDv4_f(%opencl.image2d_array_rw_t addrspace(1)*, <4 x i32> noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef20ocl_image2d_array_rwDv4_iDv4_f(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 2), <4 x i32> noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei20ocl_image2d_array_rwDv4_iS0_(%opencl.image2d_array_rw_t addrspace(1)*, <4 x i32> noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei20ocl_image2d_array_rwDv4_iS0_(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 2), <4 x i32> noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef20ocl_image2d_array_woDv4_iiDv4_f(%opencl.image2d_array_wo_t addrspace(1)*, <4 x i32> noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef20ocl_image2d_array_woDv4_iiDv4_f(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 1), <4 x i32> noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei20ocl_image2d_array_woDv4_iiS0_(%opencl.image2d_array_wo_t addrspace(1)*, <4 x i32> noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei20ocl_image2d_array_woDv4_iiS0_(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 1), <4 x i32> noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr ;; kernel void test_img1d(write_only image1d_t image_wo, read_write image1d_t image_rw) ;; { @@ -143,28 +128,28 @@ declare spir_func void @_Z12write_imagei20ocl_image2d_array_woDv4_iiS0_(%opencl. 
; CHECK-SPIRV: OpImageWrite %[[#IMG1D_WO]] ; CHECK-SPIRV: OpImageWrite %[[#IMG1D_WO]] -define dso_local spir_kernel void @test_img1d(%opencl.image1d_wo_t addrspace(1)* %image_wo, %opencl.image1d_rw_t addrspace(1)* %image_rw) local_unnamed_addr { +define dso_local spir_kernel void @test_img1d(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) %image_wo, target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2) %image_rw) local_unnamed_addr { entry: - call spir_func void @_Z12write_imagef14ocl_image1d_woiDv4_f(%opencl.image1d_wo_t addrspace(1)* %image_wo, i32 noundef 0, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei14ocl_image1d_woiDv4_i(%opencl.image1d_wo_t addrspace(1)* %image_wo, i32 noundef 0, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef14ocl_image1d_rwiDv4_f(%opencl.image1d_rw_t addrspace(1)* %image_rw, i32 noundef 0, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei14ocl_image1d_rwiDv4_i(%opencl.image1d_rw_t addrspace(1)* %image_rw, i32 noundef 0, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef14ocl_image1d_woiiDv4_f(%opencl.image1d_wo_t addrspace(1)* %image_wo, i32 noundef 0, i32 noundef 0, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei14ocl_image1d_woiiDv4_i(%opencl.image1d_wo_t addrspace(1)* %image_wo, i32 noundef 0, i32 noundef 0, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef14ocl_image1d_woiDv4_f(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) %image_wo, i32 noundef 0, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei14ocl_image1d_woiDv4_i(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) %image_wo, i32 noundef 0, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef14ocl_image1d_rwiDv4_f(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2) %image_rw, i32 noundef 0, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei14ocl_image1d_rwiDv4_i(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2) %image_rw, i32 noundef 0, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef14ocl_image1d_woiiDv4_f(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) %image_wo, i32 noundef 0, i32 noundef 0, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei14ocl_image1d_woiiDv4_i(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) %image_wo, i32 noundef 0, i32 noundef 0, <4 x i32> noundef zeroinitializer) ret void } -declare spir_func void @_Z12write_imagef14ocl_image1d_woiDv4_f(%opencl.image1d_wo_t addrspace(1)*, i32 noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef14ocl_image1d_woiDv4_f(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1), i32 noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei14ocl_image1d_woiDv4_i(%opencl.image1d_wo_t addrspace(1)*, i32 noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei14ocl_image1d_woiDv4_i(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1), i32 noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef14ocl_image1d_rwiDv4_f(%opencl.image1d_rw_t addrspace(1)*, i32 noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef14ocl_image1d_rwiDv4_f(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2), i32 noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei14ocl_image1d_rwiDv4_i(%opencl.image1d_rw_t 
addrspace(1)*, i32 noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei14ocl_image1d_rwiDv4_i(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2), i32 noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef14ocl_image1d_woiiDv4_f(%opencl.image1d_wo_t addrspace(1)*, i32 noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef14ocl_image1d_woiiDv4_f(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1), i32 noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei14ocl_image1d_woiiDv4_i(%opencl.image1d_wo_t addrspace(1)*, i32 noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei14ocl_image1d_woiiDv4_i(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1), i32 noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr ;; kernel void test_img1d_buffer(write_only image1d_buffer_t image_wo, read_write image1d_buffer_t image_rw) ;; { @@ -182,22 +167,22 @@ declare spir_func void @_Z12write_imagei14ocl_image1d_woiiDv4_i(%opencl.image1d_ ; CHECK-SPIRV: OpImageWrite %[[#IMG1D_BUFFER_RW]] ; CHECK-SPIRV: OpImageWrite %[[#IMG1D_BUFFER_RW]] -define dso_local spir_kernel void @test_img1d_buffer(%opencl.image1d_buffer_wo_t addrspace(1)* %image_wo, %opencl.image1d_buffer_rw_t addrspace(1)* %image_rw) local_unnamed_addr { +define dso_local spir_kernel void @test_img1d_buffer(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 1) %image_wo, target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 2) %image_rw) local_unnamed_addr { entry: - call spir_func void @_Z12write_imagef21ocl_image1d_buffer_woiDv4_f(%opencl.image1d_buffer_wo_t addrspace(1)* %image_wo, i32 noundef 0, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei21ocl_image1d_buffer_woiDv4_i(%opencl.image1d_buffer_wo_t addrspace(1)* %image_wo, i32 noundef 0, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef21ocl_image1d_buffer_rwiDv4_f(%opencl.image1d_buffer_rw_t addrspace(1)* %image_rw, i32 noundef 0, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei21ocl_image1d_buffer_rwiDv4_i(%opencl.image1d_buffer_rw_t addrspace(1)* %image_rw, i32 noundef 0, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef21ocl_image1d_buffer_woiDv4_f(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 1) %image_wo, i32 noundef 0, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei21ocl_image1d_buffer_woiDv4_i(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 1) %image_wo, i32 noundef 0, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef21ocl_image1d_buffer_rwiDv4_f(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 2) %image_rw, i32 noundef 0, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei21ocl_image1d_buffer_rwiDv4_i(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 2) %image_rw, i32 noundef 0, <4 x i32> noundef zeroinitializer) ret void } -declare spir_func void @_Z12write_imagef21ocl_image1d_buffer_woiDv4_f(%opencl.image1d_buffer_wo_t addrspace(1)*, i32 noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef21ocl_image1d_buffer_woiDv4_f(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 1), i32 noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei21ocl_image1d_buffer_woiDv4_i(%opencl.image1d_buffer_wo_t addrspace(1)*, i32 noundef, <4 x i32> noundef) local_unnamed_addr 
+declare spir_func void @_Z12write_imagei21ocl_image1d_buffer_woiDv4_i(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 1), i32 noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef21ocl_image1d_buffer_rwiDv4_f(%opencl.image1d_buffer_rw_t addrspace(1)*, i32 noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef21ocl_image1d_buffer_rwiDv4_f(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 2), i32 noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei21ocl_image1d_buffer_rwiDv4_i(%opencl.image1d_buffer_rw_t addrspace(1)*, i32 noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei21ocl_image1d_buffer_rwiDv4_i(target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 2), i32 noundef, <4 x i32> noundef) local_unnamed_addr ;; kernel void test_img1d_array(write_only image1d_array_t image_wo, read_write image1d_array_t image_rw) ;; { @@ -221,28 +206,28 @@ declare spir_func void @_Z12write_imagei21ocl_image1d_buffer_rwiDv4_i(%opencl.im ; CHECK-SPIRV: OpImageWrite %[[#IMG1D_ARRAY_WO]] ; CHECK-SPIRV: OpImageWrite %[[#IMG1D_ARRAY_WO]] -define dso_local spir_kernel void @test_img1d_array(%opencl.image1d_array_wo_t addrspace(1)* %image_wo, %opencl.image1d_array_rw_t addrspace(1)* %image_rw) local_unnamed_addr { +define dso_local spir_kernel void @test_img1d_array(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 1) %image_wo, target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 2) %image_rw) local_unnamed_addr { entry: - call spir_func void @_Z12write_imagef20ocl_image1d_array_woDv2_iDv4_f(%opencl.image1d_array_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei20ocl_image1d_array_woDv2_iDv4_i(%opencl.image1d_array_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef20ocl_image1d_array_rwDv2_iDv4_f(%opencl.image1d_array_rw_t addrspace(1)* %image_rw, <2 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei20ocl_image1d_array_rwDv2_iDv4_i(%opencl.image1d_array_rw_t addrspace(1)* %image_rw, <2 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef20ocl_image1d_array_woDv2_iiDv4_f(%opencl.image1d_array_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei20ocl_image1d_array_woDv2_iiDv4_i(%opencl.image1d_array_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef20ocl_image1d_array_woDv2_iDv4_f(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei20ocl_image1d_array_woDv2_iDv4_i(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef20ocl_image1d_array_rwDv2_iDv4_f(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 2) %image_rw, <2 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei20ocl_image1d_array_rwDv2_iDv4_i(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 2) %image_rw, <2 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) 
+ call spir_func void @_Z12write_imagef20ocl_image1d_array_woDv2_iiDv4_f(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei20ocl_image1d_array_woDv2_iiDv4_i(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, <4 x i32> noundef zeroinitializer) ret void } -declare spir_func void @_Z12write_imagef20ocl_image1d_array_woDv2_iDv4_f(%opencl.image1d_array_wo_t addrspace(1)*, <2 x i32> noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef20ocl_image1d_array_woDv2_iDv4_f(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 1), <2 x i32> noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei20ocl_image1d_array_woDv2_iDv4_i(%opencl.image1d_array_wo_t addrspace(1)*, <2 x i32> noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei20ocl_image1d_array_woDv2_iDv4_i(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 1), <2 x i32> noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef20ocl_image1d_array_rwDv2_iDv4_f(%opencl.image1d_array_rw_t addrspace(1)*, <2 x i32> noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef20ocl_image1d_array_rwDv2_iDv4_f(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 2), <2 x i32> noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei20ocl_image1d_array_rwDv2_iDv4_i(%opencl.image1d_array_rw_t addrspace(1)*, <2 x i32> noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei20ocl_image1d_array_rwDv2_iDv4_i(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 2), <2 x i32> noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef20ocl_image1d_array_woDv2_iiDv4_f(%opencl.image1d_array_wo_t addrspace(1)*, <2 x i32> noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef20ocl_image1d_array_woDv2_iiDv4_f(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 1), <2 x i32> noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei20ocl_image1d_array_woDv2_iiDv4_i(%opencl.image1d_array_wo_t addrspace(1)*, <2 x i32> noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei20ocl_image1d_array_woDv2_iiDv4_i(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 1), <2 x i32> noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr ;; kernel void test_img2d_depth(write_only image2d_depth_t image_wo) ;; { @@ -259,17 +244,17 @@ declare spir_func void @_Z12write_imagei20ocl_image1d_array_woDv2_iiDv4_i(%openc ; CHECK-SPIRV: OpImageWrite %[[#IMG2D_DEPTH_WO]] ; CHECK-SPIRV: OpImageWrite %[[#IMG2D_DEPTH_WO]] -define dso_local spir_kernel void @test_img2d_depth(%opencl.image2d_depth_wo_t addrspace(1)* %image_wo) local_unnamed_addr { +define dso_local spir_kernel void @test_img2d_depth(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 1) %image_wo) local_unnamed_addr { entry: - call spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_if(%opencl.image2d_depth_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, float noundef 0.000000e+00) - call spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_if(%opencl.image2d_depth_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, float noundef 0.000000e+00) - 
call spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_iif(%opencl.image2d_depth_wo_t addrspace(1)* %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, float noundef 0.000000e+00) + call spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_if(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, float noundef 0.000000e+00) + call spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_if(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, float noundef 0.000000e+00) + call spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_iif(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 1) %image_wo, <2 x i32> noundef zeroinitializer, i32 noundef 0, float noundef 0.000000e+00) ret void } -declare spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_if(%opencl.image2d_depth_wo_t addrspace(1)*, <2 x i32> noundef, float noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_if(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 1), <2 x i32> noundef, float noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_iif(%opencl.image2d_depth_wo_t addrspace(1)*, <2 x i32> noundef, i32 noundef, float noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_iif(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 1), <2 x i32> noundef, i32 noundef, float noundef) local_unnamed_addr ;; kernel void test_img2d_array_depth(write_only image2d_array_depth_t image_wo) ;; { @@ -286,17 +271,17 @@ declare spir_func void @_Z12write_imagef20ocl_image2d_depth_woDv2_iif(%opencl.im ; CHECK-SPIRV: OpImageWrite %[[#IMG2D_ARRAY_DEPTH_WO]] ; CHECK-SPIRV: OpImageWrite %[[#IMG2D_ARRAY_DEPTH_WO]] -define dso_local spir_kernel void @test_img2d_array_depth(%opencl.image2d_array_depth_wo_t addrspace(1)* %image_wo) local_unnamed_addr { +define dso_local spir_kernel void @test_img2d_array_depth(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 1) %image_wo) local_unnamed_addr { entry: - call spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_if(%opencl.image2d_array_depth_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, float noundef 0.000000e+00) - call spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_if(%opencl.image2d_array_depth_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, float noundef 0.000000e+00) - call spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_iif(%opencl.image2d_array_depth_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, float noundef 0.000000e+00) + call spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_if(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, float noundef 0.000000e+00) + call spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_if(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, float noundef 0.000000e+00) + call spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_iif(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, float noundef 0.000000e+00) ret void } -declare spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_if(%opencl.image2d_array_depth_wo_t addrspace(1)*, <4 x i32> noundef, float noundef) local_unnamed_addr +declare spir_func void 
@_Z12write_imagef26ocl_image2d_array_depth_woDv4_if(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 1), <4 x i32> noundef, float noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_iif(%opencl.image2d_array_depth_wo_t addrspace(1)*, <4 x i32> noundef, i32 noundef, float noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_iif(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 1), <4 x i32> noundef, i32 noundef, float noundef) local_unnamed_addr ;; kernel void test_img3d(write_only image3d_t image_wo, read_write image3d_t image_rw) ;; { @@ -320,25 +305,25 @@ declare spir_func void @_Z12write_imagef26ocl_image2d_array_depth_woDv4_iif(%ope ; CHECK-SPIRV: OpImageWrite %[[#IMG3D_WO]] ; CHECK-SPIRV: OpImageWrite %[[#IMG3D_WO]] -define dso_local spir_kernel void @test_img3d(%opencl.image3d_wo_t addrspace(1)* %image_wo, %opencl.image3d_rw_t addrspace(1)* %image_rw) local_unnamed_addr { +define dso_local spir_kernel void @test_img3d(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) %image_wo, target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 2) %image_rw) local_unnamed_addr { entry: - call spir_func void @_Z12write_imagef14ocl_image3d_woDv4_iDv4_f(%opencl.image3d_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei14ocl_image3d_woDv4_iS0_(%opencl.image3d_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef14ocl_image3d_rwDv4_iDv4_f(%opencl.image3d_rw_t addrspace(1)* %image_rw, <4 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei14ocl_image3d_rwDv4_iS0_(%opencl.image3d_rw_t addrspace(1)* %image_rw, <4 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) - call spir_func void @_Z12write_imagef14ocl_image3d_woDv4_iiDv4_f(%opencl.image3d_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, <4 x float> noundef zeroinitializer) - call spir_func void @_Z12write_imagei14ocl_image3d_woDv4_iiS0_(%opencl.image3d_wo_t addrspace(1)* %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef14ocl_image3d_woDv4_iDv4_f(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei14ocl_image3d_woDv4_iS0_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef14ocl_image3d_rwDv4_iDv4_f(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 2) %image_rw, <4 x i32> noundef zeroinitializer, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei14ocl_image3d_rwDv4_iS0_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 2) %image_rw, <4 x i32> noundef zeroinitializer, <4 x i32> noundef zeroinitializer) + call spir_func void @_Z12write_imagef14ocl_image3d_woDv4_iiDv4_f(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, <4 x float> noundef zeroinitializer) + call spir_func void @_Z12write_imagei14ocl_image3d_woDv4_iiS0_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) %image_wo, <4 x i32> noundef zeroinitializer, i32 noundef 0, <4 x i32> noundef zeroinitializer) ret void } -declare spir_func void 
@_Z12write_imagef14ocl_image3d_woDv4_iDv4_f(%opencl.image3d_wo_t addrspace(1)*, <4 x i32> noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef14ocl_image3d_woDv4_iDv4_f(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32> noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei14ocl_image3d_woDv4_iS0_(%opencl.image3d_wo_t addrspace(1)*, <4 x i32> noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei14ocl_image3d_woDv4_iS0_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32> noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef14ocl_image3d_rwDv4_iDv4_f(%opencl.image3d_rw_t addrspace(1)*, <4 x i32> noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef14ocl_image3d_rwDv4_iDv4_f(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 2), <4 x i32> noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei14ocl_image3d_rwDv4_iS0_(%opencl.image3d_rw_t addrspace(1)*, <4 x i32> noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei14ocl_image3d_rwDv4_iS0_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 2), <4 x i32> noundef, <4 x i32> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagef14ocl_image3d_woDv4_iiDv4_f(%opencl.image3d_wo_t addrspace(1)*, <4 x i32> noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagef14ocl_image3d_woDv4_iiDv4_f(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32> noundef, i32 noundef, <4 x float> noundef) local_unnamed_addr -declare spir_func void @_Z12write_imagei14ocl_image3d_woDv4_iiS0_(%opencl.image3d_wo_t addrspace(1)*, <4 x i32> noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr +declare spir_func void @_Z12write_imagei14ocl_image3d_woDv4_iiS0_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32> noundef, i32 noundef, <4 x i32> noundef) local_unnamed_addr diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll index 9297d17cfa323..f4e6c83a8cd51 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ;; This test checks that the backend is capable to correctly translate ;; atomic_cmpxchg OpenCL C 1.2 built-in function [1] into corresponding SPIR-V @@ -14,8 +14,9 @@ ;; } ; CHECK-SPIRV: OpName %[[#TEST:]] "test_atomic_cmpxchg" +; CHECK-SPIRV-DAG: %[[#UCHAR:]] = OpTypeInt 8 0 ; CHECK-SPIRV-DAG: %[[#UINT:]] = OpTypeInt 32 0 -; CHECK-SPIRV-DAG: %[[#UINT_PTR:]] = OpTypePointer CrossWorkgroup %[[#UINT]] +; CHECK-SPIRV-DAG: %[[#UCHAR_PTR:]] = OpTypePointer CrossWorkgroup %[[#UCHAR]] ;; In SPIR-V, atomic_cmpxchg is represented as OpAtomicCompareExchange [2], ;; which also includes memory scope and two memory semantic arguments. 
The @@ -30,7 +31,7 @@ ; CHECK-SPIRV-DAG: %[[#RELAXED:]] = OpConstant %[[#UINT]] 0 ; CHECK-SPIRV: %[[#TEST]] = OpFunction %[[#]] -; CHECK-SPIRV: %[[#PTR:]] = OpFunctionParameter %[[#UINT_PTR]] +; CHECK-SPIRV: %[[#PTR:]] = OpFunctionParameter %[[#UCHAR_PTR]] ; CHECK-SPIRV: %[[#CMP:]] = OpFunctionParameter %[[#UINT]] ; CHECK-SPIRV: %[[#VAL:]] = OpFunctionParameter %[[#UINT]] ; CHECK-SPIRV: %[[#]] = OpAtomicCompareExchange %[[#UINT]] %[[#PTR]] %[[#WORKGROUP_SCOPE]] %[[#RELAXED]] %[[#RELAXED]] %[[#VAL]] %[[#CMP]] diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll index 741fbf88fec0b..3e805e6eea97d 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ;; This test checks that the backend is capable to correctly translate ;; legacy atomic OpenCL C 1.2 built-in functions [1] into corresponding SPIR-V @@ -11,7 +11,8 @@ ; CHECK-SPIRV: OpName %[[#TEST:]] "test_legacy_atomics" ; CHECK-SPIRV-DAG: %[[#UINT:]] = OpTypeInt 32 0 -; CHECK-SPIRV-DAG: %[[#UINT_PTR:]] = OpTypePointer CrossWorkgroup %[[#UINT]] +; CHECK-SPIRV-DAG: %[[#UCHAR:]] = OpTypeInt 8 0 +; CHECK-SPIRV-DAG: %[[#UCHAR_PTR:]] = OpTypePointer CrossWorkgroup %[[#UCHAR]] ;; In SPIR-V, atomic_add is represented as OpAtomicIAdd [2], which also includes ;; memory scope and memory semantic arguments. The backend applies a default @@ -25,7 +26,7 @@ ; CHECK-SPIRV-DAG: %[[#RELAXED:]] = OpConstant %[[#UINT]] 0 ; CHECK-SPIRV: %[[#TEST]] = OpFunction %[[#]] -; CHECK-SPIRV: %[[#PTR:]] = OpFunctionParameter %[[#UINT_PTR]] +; CHECK-SPIRV: %[[#PTR:]] = OpFunctionParameter %[[#UCHAR_PTR]] ; CHECK-SPIRV: %[[#VAL:]] = OpFunctionParameter %[[#UINT]] ; CHECK-SPIRV: %[[#]] = OpAtomicIAdd %[[#UINT]] %[[#PTR]] %[[#WORKGROUP_SCOPE]] %[[#RELAXED]] %[[#VAL]] ; CHECK-SPIRV: %[[#]] = OpAtomicIAdd %[[#UINT]] %[[#PTR]] %[[#WORKGROUP_SCOPE]] %[[#RELAXED]] %[[#VAL]] diff --git a/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll b/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll index ac9fa0b1335f3..96e1d518a2d51 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/SampledImage.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ;; constant sampler_t constSampl = CLK_FILTER_LINEAR; ;; @@ -16,9 +16,6 @@ ;; *results = read_imagei(input, CLK_FILTER_NEAREST|CLK_ADDRESS_REPEAT, coords); ;; } -%opencl.image2d_ro_t = type opaque -%opencl.sampler_t = type opaque - ; CHECK-SPIRV: OpCapability LiteralSampler ; CHECK-SPIRV: OpName %[[#sample_kernel_float:]] "sample_kernel_float" ; CHECK-SPIRV: OpName %[[#sample_kernel_int:]] "sample_kernel_int" @@ -43,22 +40,22 @@ ; CHECK-SPIRV: %[[#SampledImage3:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler2]] ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage3]] -define dso_local spir_kernel void @sample_kernel_float(%opencl.image2d_ro_t addrspace(1)* %input, <2 x float> noundef %coords, <4 x float> addrspace(1)* nocapture noundef writeonly %results, %opencl.sampler_t 
addrspace(2)* %argSampl) local_unnamed_addr { +define dso_local spir_kernel void @sample_kernel_float(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, <2 x float> noundef %coords, <4 x float> addrspace(1)* nocapture noundef writeonly %results, target("spirv.Sampler") %argSampl) local_unnamed_addr { entry: - %0 = tail call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 32) - %call = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_f(%opencl.image2d_ro_t addrspace(1)* %input, %opencl.sampler_t addrspace(2)* %0, <2 x float> noundef %coords) + %0 = tail call spir_func target("spirv.Sampler") @__translate_sampler_initializer(i32 32) + %call = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, target("spirv.Sampler") %0, <2 x float> noundef %coords) store <4 x float> %call, <4 x float> addrspace(1)* %results, align 16 - %call1 = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_f(%opencl.image2d_ro_t addrspace(1)* %input, %opencl.sampler_t addrspace(2)* %argSampl, <2 x float> noundef %coords) + %call1 = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, target("spirv.Sampler") %argSampl, <2 x float> noundef %coords) store <4 x float> %call1, <4 x float> addrspace(1)* %results, align 16 - %1 = tail call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 22) - %call2 = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_f(%opencl.image2d_ro_t addrspace(1)* %input, %opencl.sampler_t addrspace(2)* %1, <2 x float> noundef %coords) + %1 = tail call spir_func target("spirv.Sampler") @__translate_sampler_initializer(i32 22) + %call2 = tail call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, target("spirv.Sampler") %1, <2 x float> noundef %coords) store <4 x float> %call2, <4 x float> addrspace(1)* %results, align 16 ret void } -declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_f(%opencl.image2d_ro_t addrspace(1)*, %opencl.sampler_t addrspace(2)*, <2 x float> noundef) local_unnamed_addr +declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), target("spirv.Sampler"), <2 x float> noundef) local_unnamed_addr -declare spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32) local_unnamed_addr +declare spir_func target("spirv.Sampler") @__translate_sampler_initializer(i32) local_unnamed_addr ; CHECK-SPIRV: %[[#sample_kernel_int]] = OpFunction %{{.*}} ; CHECK-SPIRV: %[[#InputImage:]] = OpFunctionParameter %{{.*}} @@ -73,17 +70,17 @@ declare spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializ ; CHECK-SPIRV: %[[#SampledImage6:]] = OpSampledImage %[[#SampledImageTy]] %[[#InputImage]] %[[#ConstSampler4]] ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SampledImage6]] -define dso_local spir_kernel void @sample_kernel_int(%opencl.image2d_ro_t addrspace(1)* %input, <2 x float> noundef %coords, <4 x i32> addrspace(1)* nocapture noundef writeonly %results, %opencl.sampler_t addrspace(2)* %argSampl) local_unnamed_addr { +define dso_local spir_kernel void @sample_kernel_int(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, <2 x float> noundef 
%coords, <4 x i32> addrspace(1)* nocapture noundef writeonly %results, target("spirv.Sampler") %argSampl) local_unnamed_addr { entry: - %0 = tail call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 32) - %call = tail call spir_func <4 x i32> @_Z11read_imagei14ocl_image2d_ro11ocl_samplerDv2_f(%opencl.image2d_ro_t addrspace(1)* %input, %opencl.sampler_t addrspace(2)* %0, <2 x float> noundef %coords) + %0 = tail call spir_func target("spirv.Sampler") @__translate_sampler_initializer(i32 32) + %call = tail call spir_func <4 x i32> @_Z11read_imagei14ocl_image2d_ro11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, target("spirv.Sampler") %0, <2 x float> noundef %coords) store <4 x i32> %call, <4 x i32> addrspace(1)* %results, align 16 - %call1 = tail call spir_func <4 x i32> @_Z11read_imagei14ocl_image2d_ro11ocl_samplerDv2_f(%opencl.image2d_ro_t addrspace(1)* %input, %opencl.sampler_t addrspace(2)* %argSampl, <2 x float> noundef %coords) + %call1 = tail call spir_func <4 x i32> @_Z11read_imagei14ocl_image2d_ro11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, target("spirv.Sampler") %argSampl, <2 x float> noundef %coords) store <4 x i32> %call1, <4 x i32> addrspace(1)* %results, align 16 - %1 = tail call spir_func %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 22) - %call2 = tail call spir_func <4 x i32> @_Z11read_imagei14ocl_image2d_ro11ocl_samplerDv2_f(%opencl.image2d_ro_t addrspace(1)* %input, %opencl.sampler_t addrspace(2)* %1, <2 x float> noundef %coords) + %1 = tail call spir_func target("spirv.Sampler") @__translate_sampler_initializer(i32 22) + %call2 = tail call spir_func <4 x i32> @_Z11read_imagei14ocl_image2d_ro11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %input, target("spirv.Sampler") %1, <2 x float> noundef %coords) store <4 x i32> %call2, <4 x i32> addrspace(1)* %results, align 16 ret void } -declare spir_func <4 x i32> @_Z11read_imagei14ocl_image2d_ro11ocl_samplerDv2_f(%opencl.image2d_ro_t addrspace(1)*, %opencl.sampler_t addrspace(2)*, <2 x float> noundef) local_unnamed_addr +declare spir_func <4 x i32> @_Z11read_imagei14ocl_image2d_ro11ocl_samplerDv2_f(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), target("spirv.Sampler"), <2 x float> noundef) local_unnamed_addr diff --git a/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll b/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll index cf0c65f597172..52b25a5913f09 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4 +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4 ;; TODO: We cannot check SPIR_V 1.1 and 1.4 simultaneously, implement additional ;; run with CHECK-SPIRV1_1. 
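;; A note on the mechanical rewrite running through these SPIR-V tests: the
;; -opaque-pointers=0 flag is dropped from every RUN line, and each OpenCL
;; builtin type previously modeled as a pointer to a named opaque struct is
;; rewritten as a target extension type. A minimal before/after sketch, with
;; operands following the encoding visible in these tests (dimensionality
;; 0/1/2/5 for 1D/2D/3D/buffer, then depth, arrayed, MS, sampled, format, and
;; a trailing access qualifier of 0/1/2 for read_only/write_only/read_write):
;;
;;   %opencl.image2d_ro_t = type opaque
;;   define spir_kernel void @k(%opencl.image2d_ro_t addrspace(1)* %img) {
;;     ret void
;;   }
;;
;; becomes, with no separate type declaration needed,
;;
;;   define spir_kernel void @k(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %img) {
;;     ret void
;;   }
;;
;; Samplers, pipes, and device events get the same treatment
;; (target("spirv.Sampler"), target("spirv.Pipe", 0) and target("spirv.Pipe", 1)
;; for the read/write halves, target("spirv.DeviceEvent")), as in cl-types.ll
;; and clk_event_t.ll below.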
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll index 4988ea41aa88e..6de610b2240da 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll @@ -1,23 +1,21 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: %[[#IMAGE_TYPE:]] = OpTypeImage ; CHECK-SPIRV: %[[#IMAGE_ARG:]] = OpFunctionParameter %[[#IMAGE_TYPE]] ; CHECK-SPIRV: %[[#]] = OpImageQuerySizeLod %[[#]] %[[#IMAGE_ARG]] ; CHECK-SPIRV: %[[#]] = OpImageQuerySizeLod %[[#]] %[[#IMAGE_ARG]] -%opencl.image2d_array_ro_t = type opaque - -define spir_kernel void @sample_kernel(%opencl.image2d_array_ro_t addrspace(1)* %input) { +define spir_kernel void @sample_kernel(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %input) { entry: - %call.tmp1 = call spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)* %input) + %call.tmp1 = call spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %input) %call.tmp2 = shufflevector <2 x i32> %call.tmp1, <2 x i32> undef, <3 x i32> - %call.tmp3 = call spir_func i64 @_Z20get_image_array_size20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)* %input) + %call.tmp3 = call spir_func i64 @_Z20get_image_array_size20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %input) %call.tmp4 = trunc i64 %call.tmp3 to i32 %call.tmp5 = insertelement <3 x i32> %call.tmp2, i32 %call.tmp4, i32 2 %call.old = extractelement <3 x i32> %call.tmp5, i32 0 ret void } -declare spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)*) +declare spir_func <2 x i32> @_Z13get_image_dim20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0)) -declare spir_func i64 @_Z20get_image_array_size20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)*) +declare spir_func i64 @_Z20get_image_array_size20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0)) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll index edc571e6d05b8..52b7dac8866f6 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll @@ -18,7 +18,7 @@ ;; ) { ;; } -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-DAG: OpCapability Sampled1D ; CHECK-SPIRV-DAG: OpCapability SampledBuffer @@ -37,17 +37,6 @@ ; CHECK-SPIRV: %[[#SAMP_CONST:]] = OpConstantSampler %[[#SAMP]] None 0 Linear -%opencl.pipe_ro_t = type opaque -%opencl.pipe_wo_t = type opaque -%opencl.image3d_ro_t = type opaque -%opencl.image2d_array_ro_t = type opaque -%opencl.image1d_buffer_ro_t = type opaque -%opencl.image1d_ro_t = type opaque -%opencl.image1d_wo_t = type opaque -%opencl.image2d_rw_t = type opaque -%opencl.image2d_ro_t = type opaque -%opencl.sampler_t = type opaque - ; CHECK-SPIRV: %[[#]] = OpFunctionParameter %[[#PIPE_RD]] ; CHECK-SPIRV: %[[#]] = OpFunctionParameter %[[#PIPE_WR]] ; CHECK-SPIRV: %[[#]] = OpFunctionParameter %[[#IMG1D_RD]] @@ -60,28 +49,28 @@ ; 
CHECK-SPIRV: %[[#SAMP_ARG:]] = OpFunctionParameter %[[#SAMP]] define spir_kernel void @foo( - %opencl.pipe_ro_t addrspace(1)* nocapture %a, - %opencl.pipe_wo_t addrspace(1)* nocapture %b, - %opencl.image1d_ro_t addrspace(1)* nocapture %c1, - %opencl.image2d_ro_t addrspace(1)* nocapture %d1, - %opencl.image3d_ro_t addrspace(1)* nocapture %e1, - %opencl.image2d_array_ro_t addrspace(1)* nocapture %f1, - %opencl.image1d_buffer_ro_t addrspace(1)* nocapture %g1, - %opencl.image1d_wo_t addrspace(1)* nocapture %c2, - %opencl.image2d_rw_t addrspace(1)* nocapture %d3, - %opencl.sampler_t addrspace(2)* %s) { + target("spirv.Pipe", 0) %a, + target("spirv.Pipe", 1) %b, + target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %c1, + target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %d1, + target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %e1, + target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %f1, + target("spirv.Image", void, 5, 0, 0, 0, 0, 0, 0) %g1, + target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) %c2, + target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 2) %d3, + target("spirv.Sampler") %s) { entry: ; CHECK-SPIRV: %[[#SAMPIMG_VAR1:]] = OpSampledImage %[[#SAMPIMG]] %[[#IMG_ARG]] %[[#SAMP_ARG]] ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SAMPIMG_VAR1]] - %.tmp = call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv4_if(%opencl.image2d_ro_t addrspace(1)* %d1, %opencl.sampler_t addrspace(2)* %s, <4 x i32> zeroinitializer, float 1.000000e+00) + %.tmp = call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv4_if(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %d1, target("spirv.Sampler") %s, <4 x i32> zeroinitializer, float 1.000000e+00) ; CHECK-SPIRV: %[[#SAMPIMG_VAR2:]] = OpSampledImage %[[#SAMPIMG]] %[[#IMG_ARG]] %[[#SAMP_CONST]] ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#SAMPIMG_VAR2]] - %0 = call %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32 32) - %.tmp2 = call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv4_if(%opencl.image2d_ro_t addrspace(1)* %d1, %opencl.sampler_t addrspace(2)* %0, <4 x i32> zeroinitializer, float 1.000000e+00) + %0 = call target("spirv.Sampler") @__translate_sampler_initializer(i32 32) + %.tmp2 = call spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv4_if(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %d1, target("spirv.Sampler") %0, <4 x i32> zeroinitializer, float 1.000000e+00) ret void } -declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv4_if(%opencl.image2d_ro_t addrspace(1)*, %opencl.sampler_t addrspace(2)*, <4 x i32>, float) +declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv4_if(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), target("spirv.Sampler"), <4 x i32>, float) -declare %opencl.sampler_t addrspace(2)* @__translate_sampler_initializer(i32) +declare target("spirv.Sampler") @__translate_sampler_initializer(i32) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll b/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll index 150d5459ec25e..9054454879cc2 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: OpTypeDeviceEvent ; CHECK-SPIRV: OpFunction @@ -19,29 +19,27 @@ 
;; release_event(e1); ;; } -%opencl.clk_event_t = type opaque - define dso_local spir_kernel void @clk_event_t_test(i32 addrspace(1)* nocapture noundef writeonly %res, i8 addrspace(1)* noundef %prof) local_unnamed_addr { entry: - %call = call spir_func %opencl.clk_event_t* @_Z17create_user_eventv() - %call1 = call spir_func zeroext i1 @_Z14is_valid_event12ocl_clkevent(%opencl.clk_event_t* %call) + %call = call spir_func target("spirv.DeviceEvent") @_Z17create_user_eventv() + %call1 = call spir_func zeroext i1 @_Z14is_valid_event12ocl_clkevent(target("spirv.DeviceEvent") %call) %conv = zext i1 %call1 to i32 store i32 %conv, i32 addrspace(1)* %res, align 4 - call spir_func void @_Z12retain_event12ocl_clkevent(%opencl.clk_event_t* %call) - call spir_func void @_Z21set_user_event_status12ocl_clkeventi(%opencl.clk_event_t* %call, i32 noundef -42) - call spir_func void @_Z28capture_event_profiling_info12ocl_clkeventiPU3AS1v(%opencl.clk_event_t* %call, i32 noundef 1, i8 addrspace(1)* noundef %prof) - call spir_func void @_Z13release_event12ocl_clkevent(%opencl.clk_event_t* %call) + call spir_func void @_Z12retain_event12ocl_clkevent(target("spirv.DeviceEvent") %call) + call spir_func void @_Z21set_user_event_status12ocl_clkeventi(target("spirv.DeviceEvent") %call, i32 noundef -42) + call spir_func void @_Z28capture_event_profiling_info12ocl_clkeventiPU3AS1v(target("spirv.DeviceEvent") %call, i32 noundef 1, i8 addrspace(1)* noundef %prof) + call spir_func void @_Z13release_event12ocl_clkevent(target("spirv.DeviceEvent") %call) ret void } -declare spir_func %opencl.clk_event_t* @_Z17create_user_eventv() local_unnamed_addr +declare spir_func target("spirv.DeviceEvent") @_Z17create_user_eventv() local_unnamed_addr -declare spir_func zeroext i1 @_Z14is_valid_event12ocl_clkevent(%opencl.clk_event_t*) local_unnamed_addr +declare spir_func zeroext i1 @_Z14is_valid_event12ocl_clkevent(target("spirv.DeviceEvent")) local_unnamed_addr -declare spir_func void @_Z12retain_event12ocl_clkevent(%opencl.clk_event_t*) local_unnamed_addr +declare spir_func void @_Z12retain_event12ocl_clkevent(target("spirv.DeviceEvent")) local_unnamed_addr -declare spir_func void @_Z21set_user_event_status12ocl_clkeventi(%opencl.clk_event_t*, i32 noundef) local_unnamed_addr +declare spir_func void @_Z21set_user_event_status12ocl_clkeventi(target("spirv.DeviceEvent"), i32 noundef) local_unnamed_addr -declare spir_func void @_Z28capture_event_profiling_info12ocl_clkeventiPU3AS1v(%opencl.clk_event_t*, i32 noundef, i8 addrspace(1)* noundef) local_unnamed_addr +declare spir_func void @_Z28capture_event_profiling_info12ocl_clkeventiPU3AS1v(target("spirv.DeviceEvent"), i32 noundef, i8 addrspace(1)* noundef) local_unnamed_addr -declare spir_func void @_Z13release_event12ocl_clkevent(%opencl.clk_event_t*) local_unnamed_addr +declare spir_func void @_Z13release_event12ocl_clkevent(target("spirv.DeviceEvent")) local_unnamed_addr diff --git a/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll index ee91f0bde619e..6de03dd165187 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer1:]] "__device_side_enqueue_block_invoke_kernel" ; 
CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer2:]] "__device_side_enqueue_block_invoke_2_kernel" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll b/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll index 33a11f31b2e15..dc307c70612eb 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s ;; Types: ; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 @@ -26,57 +26,49 @@ ; CHECK: %[[#IMAGE2D_ARRAY_DEPTH:]] = OpLoad %[[#IMAGE2D_ARRAY_DEPTH_T]] ; CHECK-NEXT: %[[#]] = OpImageQueryLevels %[[#INT]] %[[#IMAGE2D_ARRAY_DEPTH]] -%opencl.image1d_ro_t = type opaque -%opencl.image2d_ro_t = type opaque -%opencl.image3d_ro_t = type opaque -%opencl.image1d_array_ro_t = type opaque -%opencl.image2d_array_ro_t = type opaque -%opencl.image2d_depth_ro_t = type opaque -%opencl.image2d_array_depth_ro_t = type opaque - -define spir_func void @testimage1d(%opencl.image1d_ro_t addrspace(1)* %img1, %opencl.image2d_ro_t addrspace(1)* %img2, %opencl.image3d_ro_t addrspace(1)* %img3, %opencl.image1d_array_ro_t addrspace(1)* %img4, %opencl.image2d_array_ro_t addrspace(1)* %img5, %opencl.image2d_depth_ro_t addrspace(1)* %img6, %opencl.image2d_array_depth_ro_t addrspace(1)* %img7) { +define spir_func void @testimage1d(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %img1, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %img2, target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %img3, target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0) %img4, target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %img5, target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0) %img6, target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0) %img7) { entry: - %img1.addr = alloca %opencl.image1d_ro_t addrspace(1)*, align 4 - %img2.addr = alloca %opencl.image2d_ro_t addrspace(1)*, align 4 - %img3.addr = alloca %opencl.image3d_ro_t addrspace(1)*, align 4 - %img4.addr = alloca %opencl.image1d_array_ro_t addrspace(1)*, align 4 - %img5.addr = alloca %opencl.image2d_array_ro_t addrspace(1)*, align 4 - %img6.addr = alloca %opencl.image2d_depth_ro_t addrspace(1)*, align 4 - %img7.addr = alloca %opencl.image2d_array_depth_ro_t addrspace(1)*, align 4 - store %opencl.image1d_ro_t addrspace(1)* %img1, %opencl.image1d_ro_t addrspace(1)** %img1.addr, align 4 - store %opencl.image2d_ro_t addrspace(1)* %img2, %opencl.image2d_ro_t addrspace(1)** %img2.addr, align 4 - store %opencl.image3d_ro_t addrspace(1)* %img3, %opencl.image3d_ro_t addrspace(1)** %img3.addr, align 4 - store %opencl.image1d_array_ro_t addrspace(1)* %img4, %opencl.image1d_array_ro_t addrspace(1)** %img4.addr, align 4 - store %opencl.image2d_array_ro_t addrspace(1)* %img5, %opencl.image2d_array_ro_t addrspace(1)** %img5.addr, align 4 - store %opencl.image2d_depth_ro_t addrspace(1)* %img6, %opencl.image2d_depth_ro_t addrspace(1)** %img6.addr, align 4 - store %opencl.image2d_array_depth_ro_t addrspace(1)* %img7, %opencl.image2d_array_depth_ro_t addrspace(1)** %img7.addr, align 4 - %0 = load %opencl.image1d_ro_t addrspace(1)*, %opencl.image1d_ro_t addrspace(1)** %img1.addr, align 4 - %call = call spir_func i32 @_Z24get_image_num_mip_levels14ocl_image1d_ro(%opencl.image1d_ro_t addrspace(1)* %0) - %1 = load %opencl.image2d_ro_t addrspace(1)*, %opencl.image2d_ro_t addrspace(1)** %img2.addr, align 4 - %call1 = call spir_func i32 
@_Z24get_image_num_mip_levels14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)* %1) - %2 = load %opencl.image3d_ro_t addrspace(1)*, %opencl.image3d_ro_t addrspace(1)** %img3.addr, align 4 - %call2 = call spir_func i32 @_Z24get_image_num_mip_levels14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)* %2) - %3 = load %opencl.image1d_array_ro_t addrspace(1)*, %opencl.image1d_array_ro_t addrspace(1)** %img4.addr, align 4 - %call3 = call spir_func i32 @_Z24get_image_num_mip_levels20ocl_image1d_array_ro(%opencl.image1d_array_ro_t addrspace(1)* %3) - %4 = load %opencl.image2d_array_ro_t addrspace(1)*, %opencl.image2d_array_ro_t addrspace(1)** %img5.addr, align 4 - %call4 = call spir_func i32 @_Z24get_image_num_mip_levels20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)* %4) - %5 = load %opencl.image2d_depth_ro_t addrspace(1)*, %opencl.image2d_depth_ro_t addrspace(1)** %img6.addr, align 4 - %call5 = call spir_func i32 @_Z24get_image_num_mip_levels20ocl_image2d_depth_ro(%opencl.image2d_depth_ro_t addrspace(1)* %5) - %6 = load %opencl.image2d_array_depth_ro_t addrspace(1)*, %opencl.image2d_array_depth_ro_t addrspace(1)** %img7.addr, align 4 - %call6 = call spir_func i32 @_Z24get_image_num_mip_levels26ocl_image2d_array_depth_ro(%opencl.image2d_array_depth_ro_t addrspace(1)* %6) + %img1.addr = alloca target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0), align 4 + %img2.addr = alloca target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), align 4 + %img3.addr = alloca target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), align 4 + %img4.addr = alloca target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0), align 4 + %img5.addr = alloca target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0), align 4 + %img6.addr = alloca target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0), align 4 + %img7.addr = alloca target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0), align 4 + store target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %img1, target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0)* %img1.addr, align 4 + store target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %img2, target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)* %img2.addr, align 4 + store target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) %img3, target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0)* %img3.addr, align 4 + store target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0) %img4, target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0)* %img4.addr, align 4 + store target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %img5, target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0)* %img5.addr, align 4 + store target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0) %img6, target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0)* %img6.addr, align 4 + store target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0) %img7, target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0)* %img7.addr, align 4 + %0 = load target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0), target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0)* %img1.addr, align 4 + %call = call spir_func i32 @_Z24get_image_num_mip_levels14ocl_image1d_ro(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %0) + %1 = load target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)* %img2.addr, align 4 + %call1 = call spir_func i32 @_Z24get_image_num_mip_levels14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %1) + %2 = load target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0)* %img3.addr, align 4 + %call2 = call spir_func i32 @_Z24get_image_num_mip_levels14ocl_image3d_ro(target("spirv.Image", void, 
2, 0, 0, 0, 0, 0, 0) %2) + %3 = load target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0), target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0)* %img4.addr, align 4 + %call3 = call spir_func i32 @_Z24get_image_num_mip_levels20ocl_image1d_array_ro(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0) %3) + %4 = load target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0), target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0)* %img5.addr, align 4 + %call4 = call spir_func i32 @_Z24get_image_num_mip_levels20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %4) + %5 = load target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0), target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0)* %img6.addr, align 4 + %call5 = call spir_func i32 @_Z24get_image_num_mip_levels20ocl_image2d_depth_ro(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0) %5) + %6 = load target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0), target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0)* %img7.addr, align 4 + %call6 = call spir_func i32 @_Z24get_image_num_mip_levels26ocl_image2d_array_depth_ro(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0) %6) ret void } -declare spir_func i32 @_Z24get_image_num_mip_levels14ocl_image1d_ro(%opencl.image1d_ro_t addrspace(1)*) +declare spir_func i32 @_Z24get_image_num_mip_levels14ocl_image1d_ro(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z24get_image_num_mip_levels14ocl_image2d_ro(%opencl.image2d_ro_t addrspace(1)*) +declare spir_func i32 @_Z24get_image_num_mip_levels14ocl_image2d_ro(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z24get_image_num_mip_levels14ocl_image3d_ro(%opencl.image3d_ro_t addrspace(1)*) +declare spir_func i32 @_Z24get_image_num_mip_levels14ocl_image3d_ro(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z24get_image_num_mip_levels20ocl_image1d_array_ro(%opencl.image1d_array_ro_t addrspace(1)*) +declare spir_func i32 @_Z24get_image_num_mip_levels20ocl_image1d_array_ro(target("spirv.Image", void, 0, 0, 1, 0, 0, 0, 0)) -declare spir_func i32 @_Z24get_image_num_mip_levels20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)*) +declare spir_func i32 @_Z24get_image_num_mip_levels20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0)) -declare spir_func i32 @_Z24get_image_num_mip_levels20ocl_image2d_depth_ro(%opencl.image2d_depth_ro_t addrspace(1)*) +declare spir_func i32 @_Z24get_image_num_mip_levels20ocl_image2d_depth_ro(target("spirv.Image", void, 1, 1, 0, 0, 0, 0, 0)) -declare spir_func i32 @_Z24get_image_num_mip_levels26ocl_image2d_array_depth_ro(%opencl.image2d_array_depth_ro_t addrspace(1)*) +declare spir_func i32 @_Z24get_image_num_mip_levels26ocl_image2d_array_depth_ro(target("spirv.Image", void, 1, 1, 1, 0, 0, 0, 0)) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll b/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll index 01faff880e49f..96b275956178c 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4 +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4 ;; There are no blocks in SPIR-V. Therefore they are translated into regular ;; functions. 
An LLVM module which uses blocks, also contains some auxiliary diff --git a/llvm/test/CodeGen/SPIRV/transcoding/image_get_size_with_access_qualifiers.ll b/llvm/test/CodeGen/SPIRV/transcoding/image_get_size_with_access_qualifiers.ll index 4f461f3c43d20..a39a3ce038d6a 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/image_get_size_with_access_qualifiers.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/image_get_size_with_access_qualifiers.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-DAG: %[[#IntTyID:]] = OpTypeInt ; CHECK-SPIRV-DAG: %[[#VoidTyID:]] = OpTypeVoid @@ -7,12 +7,10 @@ ; CHECK-SPIRV: %[[#ImageArgID:]] = OpFunctionParameter %[[#ImageTyID]] ; CHECK-SPIRV: %[[#]] = OpImageQuerySizeLod %[[#VectorTyID]] %[[#ImageArgID]] -%opencl.image2d_array_ro_t = type opaque - -define spir_kernel void @sample_kernel(%opencl.image2d_array_ro_t addrspace(1)* %input) { +define spir_kernel void @sample_kernel(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %input) { entry: - %call = call spir_func i32 @_Z15get_image_width20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)* %input) + %call = call spir_func i32 @_Z15get_image_width20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0) %input) ret void } -declare spir_func i32 @_Z15get_image_width20ocl_image2d_array_ro(%opencl.image2d_array_ro_t addrspace(1)*) +declare spir_func i32 @_Z15get_image_width20ocl_image2d_array_ro(target("spirv.Image", void, 1, 0, 1, 0, 0, 0, 0)) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/image_with_access_qualifiers.ll b/llvm/test/CodeGen/SPIRV/transcoding/image_with_access_qualifiers.ll index 09c4860b86f36..bfc3f04499347 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/image_with_access_qualifiers.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/image_with_access_qualifiers.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-DAG: OpCapability ImageReadWrite ; CHECK-SPIRV-DAG: OpCapability LiteralSampler @@ -9,15 +9,13 @@ ; CHECK-SPIRV-DAG: %[[#ResID:]] = OpSampledImage %[[#TySampledImageID]] ; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod %[[#]] %[[#ResID]] -%opencl.image1d_rw_t = type opaque - -define spir_func void @sampFun(%opencl.image1d_rw_t addrspace(1)* %image) { +define spir_func void @sampFun(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2) %image) { entry: - %image.addr = alloca %opencl.image1d_rw_t addrspace(1)*, align 4 - store %opencl.image1d_rw_t addrspace(1)* %image, %opencl.image1d_rw_t addrspace(1)** %image.addr, align 4 - %0 = load %opencl.image1d_rw_t addrspace(1)*, %opencl.image1d_rw_t addrspace(1)** %image.addr, align 4 - %call = call spir_func <4 x float> @_Z11read_imagef14ocl_image1d_rw11ocl_sampleri(%opencl.image1d_rw_t addrspace(1)* %0, i32 8, i32 2) + %image.addr = alloca target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2), align 4 + store target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2) %image, ptr %image.addr, align 4 + %0 = load target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2), ptr %image.addr, align 4 + %call = call spir_func <4 x float> @_Z11read_imagef14ocl_image1d_rw11ocl_sampleri(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2) %0, i32 8, i32 2) ret void } -declare spir_func 
<4 x float> @_Z11read_imagef14ocl_image1d_rw11ocl_sampleri(%opencl.image1d_rw_t addrspace(1)*, i32, i32) +declare spir_func <4 x float> @_Z11read_imagef14ocl_image1d_rw11ocl_sampleri(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 2), i32, i32) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/optional-core-features-multiple.ll b/llvm/test/CodeGen/SPIRV/transcoding/optional-core-features-multiple.ll index a5254cdd6e7ef..fe4396db64cfe 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/optional-core-features-multiple.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/optional-core-features-multiple.ll @@ -4,13 +4,11 @@ ;; kernel void test(read_only image2d_t img) {} ;; ----------------------------------------------- -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV - -%opencl.image2d_t = type opaque +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV @d = addrspace(1) global double 1.000000e+00, align 8 -define spir_kernel void @test(%opencl.image2d_t addrspace(1)* nocapture %img) { +define spir_kernel void @test(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) %img) { entry: ret void } diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spec_const.ll b/llvm/test/CodeGen/SPIRV/transcoding/spec_const.ll index 6b76d3754366e..9fd7460885241 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/spec_const.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/spec_const.ll @@ -1,8 +1,8 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; CHECK-SPIRV-NOT: OpCapability Matrix ; CHECK-SPIRV-NOT: OpCapability Shader -; CHECK-SPIRV: OpCapability Float16Buffer +; CHECK-SPIRV: OpCapability Kernel ; CHECK-SPIRV-DAG: OpDecorate %[[#SC0:]] SpecId 0 ; CHECK-SPIRV-DAG: OpDecorate %[[#SC1:]] SpecId 1 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll index 079a2a6ce0e5e..3c45fdd0481dc 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll @@ -1,30 +1,29 @@ -; RUN: llc -O0 -opaque-pointers=0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; -; CHECK-SPIRV-DAG: %[[#i32:]] = OpTypeInt 32 0 ; CHECK-SPIRV-DAG: %[[#i8:]] = OpTypeInt 8 0 +; CHECK-SPIRV-DAG: %[[#i32:]] = OpTypeInt 32 0 ; CHECK-SPIRV-DAG: %[[#one:]] = OpConstant %[[#i32]] 1 ; CHECK-SPIRV-DAG: %[[#two:]] = OpConstant %[[#i32]] 2 ; CHECK-SPIRV-DAG: %[[#three:]] = OpConstant %[[#i32]] 3 ; CHECK-SPIRV-DAG: %[[#i32x3:]] = OpTypeArray %[[#i32]] %[[#three]] -; CHECK-SPIRV-DAG: %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]] +; CHECK-SPIRV-DAG: %[[#test_arr_init:]] = OpConstantComposite %[[#i32x3]] %[[#one]] %[[#two]] %[[#three]] +; CHECK-SPIRV-DAG: %[[#twelve:]] = OpConstant %[[#i32]] 12 ; CHECK-SPIRV-DAG: %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]] -; CHECK-SPIRV-DAG: %[[#i8_ptr:]] = OpTypePointer Function %[[#i8]] -; CHECK-SPIRV-DAG: %[[#const_i8_ptr:]] = OpTypePointer UniformConstant %[[#i8]] -; CHECK-SPIRV: %[[#test_arr_init:]] = OpConstantComposite %[[#i32x3]] %[[#one]] %[[#two]] %[[#three]] -; CHECK-SPIRV: %[[#twelve:]] = 
OpConstant %[[#i32]] 12 + ; CHECK-SPIRV: %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] ; CHECK-SPIRV: %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] -; + +; CHECK-SPIRV-DAG: %[[#i8_ptr:]] = OpTypePointer Function %[[#i8]] +; CHECK-SPIRV-DAG: %[[#const_i8_ptr:]] = OpTypePointer UniformConstant %[[#i8]] +; CHECK-SPIRV-DAG: %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]] + ; CHECK-SPIRV: %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function ; CHECK-SPIRV: %[[#arr2:]] = OpVariable %[[#i32x3_ptr]] Function -; ; CHECK-SPIRV: %[[#arr_i8_ptr:]] = OpBitcast %[[#i8_ptr]] %[[#arr]] -; CHECK-SPIRV: %[[#test_arr_const_i8_ptr:]] = OpBitcast %[[#const_i8_ptr]] %[[#test_arr]] -; CHECK-SPIRV: OpCopyMemorySized %[[#arr_i8_ptr]] %[[#test_arr_const_i8_ptr]] %[[#twelve]] Aligned 4 -; +; CHECK-SPIRV: OpCopyMemorySized %[[#arr_i8_ptr]] %[[#test_arr]] %[[#twelve]] Aligned 4 ; CHECK-SPIRV: %[[#arr2_i8_ptr:]] = OpBitcast %[[#i8_ptr]] %[[#arr2]] -; CHECK-SPIRV: %[[#test_arr2_const_i8_ptr:]] = OpBitcast %[[#const_i8_ptr]] %[[#test_arr2]] -; CHECK-SPIRV: OpCopyMemorySized %[[#arr2_i8_ptr]] %[[#test_arr2_const_i8_ptr]] %[[#twelve]] Aligned 4 +; CHECK-SPIRV: OpCopyMemorySized %[[#arr2_i8_ptr]] %[[#test_arr2]] %[[#twelve]] Aligned 4 + @__const.test.arr = private unnamed_addr addrspace(2) constant [3 x i32] [i32 1, i32 2, i32 3], align 4 @__const.test.arr2 = private unnamed_addr addrspace(2) constant [3 x i32] [i32 1, i32 2, i32 3], align 4 diff --git a/llvm/test/CodeGen/SystemZ/int-conv-01.ll b/llvm/test/CodeGen/SystemZ/int-conv-01.ll index 91ef0802223ed..491fb95cddf72 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-01.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-01.ll @@ -108,7 +108,7 @@ define i32 @f9(i64 %src, i64 %index) { ; to use LB if possible. define void @f10(ptr %ptr) { ; CHECK-LABEL: f10: -; CHECK: lb {{%r[0-9]+}}, 191(%r15) +; CHECK: lb {{%r[0-9]+}}, 183(%r15) ; CHECK: br %r14 %val0 = load volatile i32, ptr %ptr %val1 = load volatile i32, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-02.ll b/llvm/test/CodeGen/SystemZ/int-conv-02.ll index 9134e5bdbe17b..6c33ee1098ff7 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-02.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-02.ll @@ -118,7 +118,7 @@ define i32 @f10(i64 %src, i64 %index) { ; to use LLC if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llc {{%r[0-9]+}}, 187(%r15) +; CHECK: llc {{%r[0-9]+}}, 179(%r15) ; CHECK: br %r14 %val0 = load volatile i32, ptr %ptr %val1 = load volatile i32, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-03.ll b/llvm/test/CodeGen/SystemZ/int-conv-03.ll index 9c364651edf70..41f2f87186a5e 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-03.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-03.ll @@ -108,7 +108,7 @@ define i64 @f9(i64 %src, i64 %index) { ; to use LGB if possible. define void @f10(ptr %ptr) { ; CHECK-LABEL: f10: -; CHECK: lgb {{%r[0-9]+}}, 167(%r15) +; CHECK: lgb {{%r[0-9]+}}, 199(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-04.ll b/llvm/test/CodeGen/SystemZ/int-conv-04.ll index c589d0d0d4cfe..5c808920ff25e 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-04.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-04.ll @@ -117,7 +117,7 @@ define i64 @f10(i64 %src, i64 %index) { ; to use LLGC if possible. 
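; (In each of these int-conv tests, f10/f11 issues enough volatile loads to
; force a spill, and the CHECK line pins the exact displacement of the
; spill-slot reload that gets folded into the extending load; a hypothetical
; line such as llgc %r0, 199(%r15) would satisfy the updated pattern below.)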
define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llgc {{%r[0-9]+}}, 167(%r15) +; CHECK: llgc {{%r[0-9]+}}, 199(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-06.ll b/llvm/test/CodeGen/SystemZ/int-conv-06.ll index ff91c91f8f14c..1163e1e04ce6c 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-06.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-06.ll @@ -118,7 +118,7 @@ define i32 @f10(i64 %src, i64 %index) { ; to use LLH if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llh {{%r[0-9]+}}, 186(%r15) +; CHECK: llh {{%r[0-9]+}}, 178(%r15) ; CHECK: br %r14 %val0 = load volatile i32, ptr %ptr %val1 = load volatile i32, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-07.ll b/llvm/test/CodeGen/SystemZ/int-conv-07.ll index a36154fd78a8f..bc2895da2cde0 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-07.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-07.ll @@ -108,7 +108,7 @@ define i64 @f9(i64 %src, i64 %index) { ; to use LGH if possible. define void @f10(ptr %ptr) { ; CHECK-LABEL: f10: -; CHECK: lgh {{%r[0-9]+}}, 166(%r15) +; CHECK: lgh {{%r[0-9]+}}, 198(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/int-conv-08.ll b/llvm/test/CodeGen/SystemZ/int-conv-08.ll index 0abe41f299e75..82f2bcea4af78 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-08.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-08.ll @@ -117,7 +117,7 @@ define i64 @f10(i64 %src, i64 %index) { ; to use LLGH if possible. define void @f11(ptr %ptr) { ; CHECK-LABEL: f11: -; CHECK: llgh {{%r[0-9]+}}, 166(%r15) +; CHECK: llgh {{%r[0-9]+}}, 198(%r15) ; CHECK: br %r14 %val0 = load volatile i64, ptr %ptr %val1 = load volatile i64, ptr %ptr diff --git a/llvm/test/CodeGen/SystemZ/rot-02.ll b/llvm/test/CodeGen/SystemZ/rot-02.ll index 84fac6af5fcaa..aa9d841703552 100644 --- a/llvm/test/CodeGen/SystemZ/rot-02.ll +++ b/llvm/test/CodeGen/SystemZ/rot-02.ll @@ -2,8 +2,8 @@ ; Test removal of AND operations that don't affect last 6 bits of rotate amount ; operand. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefixes=CHECK,CHECK-LV -; RUN: llc < %s -mtriple=s390x-linux-gnu -early-live-intervals | FileCheck %s -check-prefixes=CHECK,CHECK-LIS +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -early-live-intervals | FileCheck %s ; Test that AND is not removed when some lower 5 bits are not set. define i32 @f1(i32 %val, i32 %amt) { @@ -76,20 +76,12 @@ define i64 @f4(i64 %val, i64 %amt) { ; Test that AND is not entirely removed if the result is reused. 
define i32 @f5(i32 %val, i32 %amt) { -; CHECK-LV-LABEL: f5: -; CHECK-LV: # %bb.0: -; CHECK-LV-NEXT: rll %r2, %r2, 0(%r3) -; CHECK-LV-NEXT: nilf %r3, 63 -; CHECK-LV-NEXT: ar %r2, %r3 -; CHECK-LV-NEXT: br %r14 -; -; CHECK-LIS-LABEL: f5: -; CHECK-LIS: # %bb.0: -; CHECK-LIS-NEXT: rll %r0, %r2, 0(%r3) -; CHECK-LIS-NEXT: nilf %r3, 63 -; CHECK-LIS-NEXT: ar %r3, %r0 -; CHECK-LIS-NEXT: lr %r2, %r3 -; CHECK-LIS-NEXT: br %r14 +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: rll %r2, %r2, 0(%r3) +; CHECK-NEXT: nilf %r3, 63 +; CHECK-NEXT: ar %r2, %r3 +; CHECK-NEXT: br %r14 %and = and i32 %amt, 63 %inv = sub i32 32, %and diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll index 14ceee58ef55c..40207090fda66 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll @@ -241,11 +241,11 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrh.u16 q0, [r5] ; CHECK-NEXT: vshl.i16 q1, q0, #3 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmla.i16 q4, q1, r3 -; CHECK-NEXT: vmov.f64 d6, d4 -; CHECK-NEXT: vmov.f64 d7, d5 +; CHECK-NEXT: vmla.i16 q3, q1, r3 +; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vmov.f64 d9, d5 ; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vshr.u16 q2, q0, #9 ; CHECK-NEXT: vshr.u16 q0, q0, #3 @@ -253,17 +253,17 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmla.i16 q1, q0, r3 ; CHECK-NEXT: vand q2, q2, q5 -; CHECK-NEXT: vshr.u16 q0, q4, #11 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vshr.u16 q0, q3, #11 +; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vshr.u16 q1, q1, #5 -; CHECK-NEXT: vmla.i16 q4, q2, r3 +; CHECK-NEXT: vmla.i16 q3, q2, r3 ; CHECK-NEXT: vand q1, q1, q7 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vand q1, q4, q6 +; CHECK-NEXT: vand q1, q3, q6 ; CHECK-NEXT: vorr q0, q0, q1 ; CHECK-NEXT: vstrh.16 q0, [r5], #16 -; CHECK-NEXT: vmov.f64 d4, d6 -; CHECK-NEXT: vmov.f64 d5, d7 +; CHECK-NEXT: vmov.f64 d4, d8 +; CHECK-NEXT: vmov.f64 d5, d9 ; CHECK-NEXT: letp lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll index 3a477f987cee6..c299b62a4c942 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS -; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS +; RUN: llc 
-mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s define arm_aapcs_vfpcc <4 x float> @sqrt_float32_t(<4 x float> %src) { ; CHECK-LABEL: sqrt_float32_t: @@ -1087,37 +1087,21 @@ entry: } define arm_aapcs_vfpcc <2 x double> @copysign_float64_t(<2 x double> %src1, <2 x double> %src2) { -; CHECK-LV-LABEL: copysign_float64_t: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .save {r7, lr} -; CHECK-LV-NEXT: push {r7, lr} -; CHECK-LV-NEXT: vmov r0, r1, d3 -; CHECK-LV-NEXT: vmov r0, lr, d2 -; CHECK-LV-NEXT: vmov r0, r3, d1 -; CHECK-LV-NEXT: vmov r12, r2, d0 -; CHECK-LV-NEXT: lsrs r1, r1, #31 -; CHECK-LV-NEXT: bfi r3, r1, #31, #1 -; CHECK-LV-NEXT: lsr.w r1, lr, #31 -; CHECK-LV-NEXT: bfi r2, r1, #31, #1 -; CHECK-LV-NEXT: vmov d1, r0, r3 -; CHECK-LV-NEXT: vmov d0, r12, r2 -; CHECK-LV-NEXT: pop {r7, pc} -; -; CHECK-LIS-LABEL: copysign_float64_t: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .save {r4, lr} -; CHECK-LIS-NEXT: push {r4, lr} -; CHECK-LIS-NEXT: vmov r0, r12, d3 -; CHECK-LIS-NEXT: vmov r0, lr, d2 -; CHECK-LIS-NEXT: vmov r4, r3, d1 -; CHECK-LIS-NEXT: vmov r1, r2, d0 -; CHECK-LIS-NEXT: lsr.w r0, r12, #31 -; CHECK-LIS-NEXT: bfi r3, r0, #31, #1 -; CHECK-LIS-NEXT: lsr.w r0, lr, #31 -; CHECK-LIS-NEXT: bfi r2, r0, #31, #1 -; CHECK-LIS-NEXT: vmov d1, r4, r3 -; CHECK-LIS-NEXT: vmov d0, r1, r2 -; CHECK-LIS-NEXT: pop {r4, pc} +; CHECK-LABEL: copysign_float64_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r0, lr, d2 +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: vmov r12, r2, d0 +; CHECK-NEXT: lsrs r1, r1, #31 +; CHECK-NEXT: bfi r3, r1, #31, #1 +; CHECK-NEXT: lsr.w r1, lr, #31 +; CHECK-NEXT: bfi r2, r1, #31, #1 +; CHECK-NEXT: vmov d1, r0, r3 +; CHECK-NEXT: vmov d0, r12, r2 +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> %src1, <2 x double> %src2) ret <2 x double> %0 @@ -1153,4 +1137,3 @@ declare <2 x double> @llvm.log2.v2f64(<2 x double>) declare <2 x double> @llvm.log10.v2f64(<2 x double>) declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>) declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) - diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index 6e644c58687fa..f4643f8c6c4a1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -223,31 +223,18 @@ entry: } define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { -; CHECK-LV-LABEL: shuffle3_i16: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: vmov q1, q0 -; CHECK-LV-NEXT: vmovx.f16 s2, s5 -; CHECK-LV-NEXT: vmovx.f16 s0, s4 -; CHECK-LV-NEXT: vins.f16 s5, s4 -; CHECK-LV-NEXT: vins.f16 s2, s0 -; CHECK-LV-NEXT: vmov.f32 s3, s5 -; CHECK-LV-NEXT: vmovx.f16 s1, s7 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vins.f16 s1, s7 -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: shuffle3_i16: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: vmovx.f16 s5, s3 -; CHECK-LIS-NEXT: vmovx.f16 s6, s1 -; CHECK-LIS-NEXT: vmovx.f16 s4, s0 -; CHECK-LIS-NEXT: 
vins.f16 s1, s0 -; CHECK-LIS-NEXT: vins.f16 s6, s4 -; CHECK-LIS-NEXT: vins.f16 s5, s3 -; CHECK-LIS-NEXT: vmov.f32 s7, s1 -; CHECK-LIS-NEXT: vmov.f32 s4, s2 -; CHECK-LIS-NEXT: vmov q0, q1 -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: shuffle3_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vins.f16 s5, s4 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmovx.f16 s1, s7 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> ret <8 x i16> %out @@ -491,79 +478,42 @@ entry: } define arm_aapcs_vfpcc <16 x i8> @shuffle3_i8(<16 x i8> %src) { -; CHECK-LV-LABEL: shuffle3_i8: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: vmov q1, q0 -; CHECK-LV-NEXT: vmov.u8 r0, q0[4] -; CHECK-LV-NEXT: vmov.8 q0[0], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[5] -; CHECK-LV-NEXT: vmov.8 q0[1], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[15] -; CHECK-LV-NEXT: vmov.8 q0[2], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[7] -; CHECK-LV-NEXT: vmov.8 q0[3], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[14] -; CHECK-LV-NEXT: vmov.8 q0[4], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[9] -; CHECK-LV-NEXT: vmov.8 q0[5], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[6] -; CHECK-LV-NEXT: vmov.8 q0[6], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[3] -; CHECK-LV-NEXT: vmov.8 q0[7], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[10] -; CHECK-LV-NEXT: vmov.8 q0[8], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[12] -; CHECK-LV-NEXT: vmov.8 q0[9], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[1] -; CHECK-LV-NEXT: vmov.8 q0[10], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[13] -; CHECK-LV-NEXT: vmov.8 q0[11], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[2] -; CHECK-LV-NEXT: vmov.8 q0[12], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[8] -; CHECK-LV-NEXT: vmov.8 q0[13], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[0] -; CHECK-LV-NEXT: vmov.8 q0[14], r0 -; CHECK-LV-NEXT: vmov.u8 r0, q1[11] -; CHECK-LV-NEXT: vmov.8 q0[15], r0 -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: shuffle3_i8: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: vmov.u8 r0, q0[4] -; CHECK-LIS-NEXT: vmov.8 q1[0], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[5] -; CHECK-LIS-NEXT: vmov.8 q1[1], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[15] -; CHECK-LIS-NEXT: vmov.8 q1[2], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[7] -; CHECK-LIS-NEXT: vmov.8 q1[3], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[14] -; CHECK-LIS-NEXT: vmov.8 q1[4], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[9] -; CHECK-LIS-NEXT: vmov.8 q1[5], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[6] -; CHECK-LIS-NEXT: vmov.8 q1[6], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[3] -; CHECK-LIS-NEXT: vmov.8 q1[7], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[10] -; CHECK-LIS-NEXT: vmov.8 q1[8], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[12] -; CHECK-LIS-NEXT: vmov.8 q1[9], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[1] -; CHECK-LIS-NEXT: vmov.8 q1[10], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[13] -; CHECK-LIS-NEXT: vmov.8 q1[11], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[2] -; CHECK-LIS-NEXT: vmov.8 q1[12], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[8] -; CHECK-LIS-NEXT: vmov.8 q1[13], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[0] -; CHECK-LIS-NEXT: vmov.8 q1[14], r0 -; CHECK-LIS-NEXT: vmov.u8 r0, q0[11] -; CHECK-LIS-NEXT: vmov.8 q1[15], r0 -; CHECK-LIS-NEXT: vmov q0, q1 -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: shuffle3_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: 
vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.8 q0[8], r0 +; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.8 q0[10], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.8 q0[11], r0 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.8 q0[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.8 q0[13], r0 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> ret <16 x i8> %out @@ -1195,31 +1145,18 @@ entry: } define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { -; CHECK-LV-LABEL: shuffle3_f16: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: vmov q1, q0 -; CHECK-LV-NEXT: vmovx.f16 s2, s5 -; CHECK-LV-NEXT: vmovx.f16 s0, s4 -; CHECK-LV-NEXT: vins.f16 s5, s4 -; CHECK-LV-NEXT: vins.f16 s2, s0 -; CHECK-LV-NEXT: vmov.f32 s3, s5 -; CHECK-LV-NEXT: vmovx.f16 s1, s7 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vins.f16 s1, s7 -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: shuffle3_f16: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: vmovx.f16 s5, s3 -; CHECK-LIS-NEXT: vmovx.f16 s6, s1 -; CHECK-LIS-NEXT: vmovx.f16 s4, s0 -; CHECK-LIS-NEXT: vins.f16 s1, s0 -; CHECK-LIS-NEXT: vins.f16 s6, s4 -; CHECK-LIS-NEXT: vins.f16 s5, s3 -; CHECK-LIS-NEXT: vmov.f32 s7, s1 -; CHECK-LIS-NEXT: vmov.f32 s4, s2 -; CHECK-LIS-NEXT: vmov q0, q1 -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: shuffle3_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vins.f16 s5, s4 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmovx.f16 s1, s7 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> ret <8 x half> %out @@ -1530,27 +1467,47 @@ entry: ret <2 x double> %out } define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) { -; CHECK-LABEL: shuffle9_f64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.f32 s18, s20 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s19, s21 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s11, s13 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle9_f64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vmov q5, q2 +; CHECK-LV-NEXT: vmov.f32 s16, s0 +; CHECK-LV-NEXT: vmov.f32 s18, s20 +; CHECK-LV-NEXT: vmov.f32 s20, s2 +; CHECK-LV-NEXT: vmov.f32 s10, s12 +; CHECK-LV-NEXT: vmov.f32 s19, s21 +; 
CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s17, s1 +; CHECK-LV-NEXT: vmov.f32 s21, s3 +; CHECK-LV-NEXT: vmov q0, q4 +; CHECK-LV-NEXT: vmov.f32 s12, s6 +; CHECK-LV-NEXT: vmov.f32 s11, s13 +; CHECK-LV-NEXT: vmov.f32 s9, s5 +; CHECK-LV-NEXT: vmov.f32 s13, s7 +; CHECK-LV-NEXT: vmov q1, q5 +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: shuffle9_f64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vmov q5, q2 +; CHECK-LIS-NEXT: vmov q4, q0 +; CHECK-LIS-NEXT: vmov.f32 s2, s20 +; CHECK-LIS-NEXT: vmov.f32 s20, s18 +; CHECK-LIS-NEXT: vmov.f32 s10, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s21 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s21, s19 +; CHECK-LIS-NEXT: vmov.f32 s12, s6 +; CHECK-LIS-NEXT: vmov.f32 s11, s13 +; CHECK-LIS-NEXT: vmov.f32 s9, s5 +; CHECK-LIS-NEXT: vmov.f32 s13, s7 +; CHECK-LIS-NEXT: vmov q1, q5 +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> ret <8 x double> %out @@ -1623,27 +1580,47 @@ entry: ret <2 x i64> %out } define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) { -; CHECK-LABEL: shuffle9_i64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.f32 s18, s20 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s19, s21 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s11, s13 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle9_i64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vmov q5, q2 +; CHECK-LV-NEXT: vmov.f32 s16, s0 +; CHECK-LV-NEXT: vmov.f32 s18, s20 +; CHECK-LV-NEXT: vmov.f32 s20, s2 +; CHECK-LV-NEXT: vmov.f32 s10, s12 +; CHECK-LV-NEXT: vmov.f32 s19, s21 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s17, s1 +; CHECK-LV-NEXT: vmov.f32 s21, s3 +; CHECK-LV-NEXT: vmov q0, q4 +; CHECK-LV-NEXT: vmov.f32 s12, s6 +; CHECK-LV-NEXT: vmov.f32 s11, s13 +; CHECK-LV-NEXT: vmov.f32 s9, s5 +; CHECK-LV-NEXT: vmov.f32 s13, s7 +; CHECK-LV-NEXT: vmov q1, q5 +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: shuffle9_i64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vmov q5, q2 +; CHECK-LIS-NEXT: vmov q4, q0 +; CHECK-LIS-NEXT: vmov.f32 s2, s20 +; CHECK-LIS-NEXT: vmov.f32 s20, s18 +; CHECK-LIS-NEXT: vmov.f32 s10, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s21 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s21, s19 +; CHECK-LIS-NEXT: vmov.f32 s12, s6 +; CHECK-LIS-NEXT: vmov.f32 s11, s13 +; CHECK-LIS-NEXT: vmov.f32 s9, s5 +; CHECK-LIS-NEXT: vmov.f32 s13, s7 +; CHECK-LIS-NEXT: vmov q1, q5 +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> ret <8 x i64> %out diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 
8a94e571e9836..ccdc996d75970 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -68,87 +68,46 @@ entry: } define void @vld3_v8i32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v8i32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v8i32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s13, s0 -; CHECK-LIS-NEXT: vmov.f32 s14, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s12, s5 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s11, s17 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LIS-NEXT: vmov.f32 s2, s16 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s3, s19 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] -; CHECK-LIS-NEXT: vmov.f32 s13, s4 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s14, s7 -; CHECK-LIS-NEXT: vmov.f32 s22, s6 -; CHECK-LIS-NEXT: vmov.f32 s12, s9 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s20, s8 -; CHECK-LIS-NEXT: vmov.f32 s21, s11 -; CHECK-LIS-NEXT: vmov.f32 s23, s17 -; CHECK-LIS-NEXT: vadd.i32 q3, q5, q3 -; CHECK-LIS-NEXT: vmov.f32 s4, s10 -; CHECK-LIS-NEXT: vmov.f32 s6, s16 -; CHECK-LIS-NEXT: vmov.f32 s7, s19 -; CHECK-LIS-NEXT: vadd.i32 q1, q3, q1 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v8i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; 
CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.i32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vadd.i32 q1, q4, q1 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr entry: %l1 = load <24 x i32>, ptr %src, align 4 %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> @@ -161,155 +120,80 @@ entry: } define void @vld3_v16i32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v16i32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 -; CHECK-LV-NEXT: vmov.f32 s18, s10 -; CHECK-LV-NEXT: vmov.f32 s21, s8 -; CHECK-LV-NEXT: vmov.f32 s22, s11 -; CHECK-LV-NEXT: vmov.f32 s16, s12 -; CHECK-LV-NEXT: vmov.f32 s17, s15 -; CHECK-LV-NEXT: vmov.f32 s20, s13 -; CHECK-LV-NEXT: vmov.f32 s23, s26 -; CHECK-LV-NEXT: vmov.f32 s19, s25 -; CHECK-LV-NEXT: vadd.i32 q4, q4, q5 -; CHECK-LV-NEXT: vmov.f32 s8, s14 -; CHECK-LV-NEXT: vmov.f32 s10, s24 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-LV-NEXT: vmov.f32 s11, s27 -; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-LV-NEXT: vadd.i32 q2, q4, q2 -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-LV-NEXT: vmov.f32 s25, s12 -; CHECK-LV-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-LV-NEXT: vmov.f32 s26, s15 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vmov.f32 s30, s14 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vmov.f32 s24, s17 -; CHECK-LV-NEXT: vmov.f32 s27, s22 -; CHECK-LV-NEXT: 
vmov.f32 s28, s16 -; CHECK-LV-NEXT: vmov.f32 s29, s19 -; CHECK-LV-NEXT: vmov.f32 s31, s21 -; CHECK-LV-NEXT: vadd.i32 q6, q7, q6 -; CHECK-LV-NEXT: vmov.f32 s12, s18 -; CHECK-LV-NEXT: vmov.f32 s14, s20 -; CHECK-LV-NEXT: vmov.f32 s15, s23 -; CHECK-LV-NEXT: vadd.i32 q3, q6, q3 -; CHECK-LV-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v16i32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s13, s0 -; CHECK-LIS-NEXT: vmov.f32 s14, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s12, s5 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s11, s17 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LIS-NEXT: vmov.f32 s2, s16 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s3, s19 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] -; CHECK-LIS-NEXT: vmov.f32 s13, s4 -; CHECK-LIS-NEXT: vmov.f32 s14, s7 -; CHECK-LIS-NEXT: vmov.f32 s22, s6 -; CHECK-LIS-NEXT: vmov.f32 s12, s9 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s20, s8 -; CHECK-LIS-NEXT: vmov.f32 s21, s11 -; CHECK-LIS-NEXT: vmov.f32 s23, s17 -; CHECK-LIS-NEXT: vadd.i32 q3, q5, q3 -; CHECK-LIS-NEXT: vmov.f32 s4, s10 -; CHECK-LIS-NEXT: vmov.f32 s7, s19 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-LIS-NEXT: vmov.f32 s6, s16 -; CHECK-LIS-NEXT: vadd.i32 q1, q3, q1 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-LIS-NEXT: vmov.f32 s18, s10 -; CHECK-LIS-NEXT: vmov.f32 s21, s8 -; CHECK-LIS-NEXT: vmov.f32 s22, s11 -; CHECK-LIS-NEXT: vmov.f32 s16, s12 -; CHECK-LIS-NEXT: vmov.f32 s17, s15 -; CHECK-LIS-NEXT: vmov.f32 s20, s13 -; CHECK-LIS-NEXT: vmov.f32 s23, s26 -; CHECK-LIS-NEXT: vmov.f32 s19, s25 -; CHECK-LIS-NEXT: vmov.f32 s8, s14 -; CHECK-LIS-NEXT: vadd.i32 q4, q4, q5 -; CHECK-LIS-NEXT: vmov.f32 s10, s24 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-LIS-NEXT: vmov.f32 s11, s27 -; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #128] -; CHECK-LIS-NEXT: vadd.i32 q2, q4, q2 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-LIS-NEXT: vmov.f32 s21, s12 -; CHECK-LIS-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-LIS-NEXT: vmov.f32 s22, s15 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s30, s14 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vmov.f32 s20, s17 -; CHECK-LIS-NEXT: vmov.f32 s23, s26 -; CHECK-LIS-NEXT: vmov.f32 s28, s16 -; CHECK-LIS-NEXT: vmov.f32 s29, s19 -; CHECK-LIS-NEXT: vmov.f32 s31, s25 -; CHECK-LIS-NEXT: vadd.i32 q5, q7, q5 -; CHECK-LIS-NEXT: vmov.f32 s12, s18 -; CHECK-LIS-NEXT: vmov.f32 s14, s24 -; CHECK-LIS-NEXT: vmov.f32 s15, s27 -; CHECK-LIS-NEXT: vadd.i32 q3, q5, q3 -; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v16i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: 
vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.i32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-NEXT: vadd.i32 q1, q4, q1 +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s20, s13 +; CHECK-NEXT: vmov.f32 s23, s26 +; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vmov.f32 s11, s27 +; CHECK-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-NEXT: vadd.i32 q2, q4, q2 +; CHECK-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s30, s14 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s24, s17 +; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s29, s19 +; CHECK-NEXT: vmov.f32 s31, s21 +; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s23 +; CHECK-NEXT: vadd.i32 q3, q6, q3 +; CHECK-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr entry: %l1 = load <48 x i32>, ptr %src, align 4 %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> @@ -364,59 +248,32 @@ entry: } define void @vld3_v4i16(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v4i16: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .save {r4, r5, r6, lr} -; CHECK-LV-NEXT: push {r4, r5, r6, lr} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0] -; CHECK-LV-NEXT: vldrh.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.u16 r5, q0[6] -; CHECK-LV-NEXT: vmov.u16 r6, q0[0] -; CHECK-LV-NEXT: vmov r0, r3, d2 -; CHECK-LV-NEXT: vmov.u16 lr, q0[2] -; CHECK-LV-NEXT: vmov r2, r4, d3 -; CHECK-LV-NEXT: vmov q1[2], q1[0], r6, r5 -; CHECK-LV-NEXT: vmov.u16 r5, q0[7] -; CHECK-LV-NEXT: vmov.u16 r6, q0[1] -; CHECK-LV-NEXT: vmov q2[2], q2[0], r6, r5 -; CHECK-LV-NEXT: vmov.u16 r5, q0[3] -; CHECK-LV-NEXT: vmov.u16 r6, q0[4] -; CHECK-LV-NEXT: vmov q1[3], q1[1], r5, r3 -; CHECK-LV-NEXT: vmov q2[3], q2[1], r6, r2 -; CHECK-LV-NEXT: vmov.u16 r12, q0[5] -; CHECK-LV-NEXT: vadd.i32 q0, q1, q2 -; CHECK-LV-NEXT: vmov q1[2], q1[0], lr, r0 -; CHECK-LV-NEXT: vmov q1[3], q1[1], r12, r4 -; CHECK-LV-NEXT: vadd.i32 q0, q0, q1 -; CHECK-LV-NEXT: vstrh.32 q0, [r1] -; CHECK-LV-NEXT: pop {r4, r5, r6, pc} -; -; CHECK-LIS-LABEL: vld3_v4i16: -; CHECK-LIS: 
@ %bb.0: @ %entry -; CHECK-LIS-NEXT: .save {r4, r5, r6, lr} -; CHECK-LIS-NEXT: push {r4, r5, r6, lr} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] -; CHECK-LIS-NEXT: vldrh.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.u16 r5, q0[6] -; CHECK-LIS-NEXT: vmov.u16 r6, q0[0] -; CHECK-LIS-NEXT: vmov r0, r2, d2 -; CHECK-LIS-NEXT: vmov.u16 r12, q0[2] -; CHECK-LIS-NEXT: vmov r3, r4, d3 -; CHECK-LIS-NEXT: vmov q1[2], q1[0], r6, r5 -; CHECK-LIS-NEXT: vmov.u16 r5, q0[7] -; CHECK-LIS-NEXT: vmov.u16 r6, q0[1] -; CHECK-LIS-NEXT: vmov q2[2], q2[0], r6, r5 -; CHECK-LIS-NEXT: vmov.u16 r5, q0[3] -; CHECK-LIS-NEXT: vmov.u16 r6, q0[4] -; CHECK-LIS-NEXT: vmov q1[3], q1[1], r5, r2 -; CHECK-LIS-NEXT: vmov q2[3], q2[1], r6, r3 -; CHECK-LIS-NEXT: vmov.u16 lr, q0[5] -; CHECK-LIS-NEXT: vadd.i32 q0, q1, q2 -; CHECK-LIS-NEXT: vmov q1[2], q1[0], r12, r0 -; CHECK-LIS-NEXT: vmov q1[3], q1[1], lr, r4 -; CHECK-LIS-NEXT: vadd.i32 q0, q0, q1 -; CHECK-LIS-NEXT: vstrh.32 q0, [r1] -; CHECK-LIS-NEXT: pop {r4, r5, r6, pc} +; CHECK-LABEL: vld3_v4i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrh.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.u16 r5, q0[6] +; CHECK-NEXT: vmov.u16 r6, q0[0] +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: vmov.u16 lr, q0[2] +; CHECK-NEXT: vmov r2, r4, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r6, r5 +; CHECK-NEXT: vmov.u16 r5, q0[7] +; CHECK-NEXT: vmov.u16 r6, q0[1] +; CHECK-NEXT: vmov q2[2], q2[0], r6, r5 +; CHECK-NEXT: vmov.u16 r5, q0[3] +; CHECK-NEXT: vmov.u16 r6, q0[4] +; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r6, r2 +; CHECK-NEXT: vmov.u16 r12, q0[5] +; CHECK-NEXT: vadd.i32 q0, q1, q2 +; CHECK-NEXT: vmov q1[2], q1[0], lr, r0 +; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %l1 = load <12 x i16>, ptr %src, align 4 %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> @@ -603,44 +460,44 @@ define void @vld3_v16i16(ptr %src, ptr %dst) { ; CHECK-LIS-NEXT: vmov.f32 s3, s13 ; CHECK-LIS-NEXT: vins.f16 s17, s9 ; CHECK-LIS-NEXT: vmov.f32 s2, s10 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-LIS-NEXT: vadd.i16 q0, q0, q4 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #16] ; CHECK-LIS-NEXT: vadd.i16 q0, q0, q1 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-LIS-NEXT: vmovx.f16 s10, s14 -; CHECK-LIS-NEXT: vmov.f32 s22, s15 -; CHECK-LIS-NEXT: vins.f16 s10, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s4 -; CHECK-LIS-NEXT: vins.f16 s22, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s7 -; CHECK-LIS-NEXT: vmov.f32 s23, s6 -; CHECK-LIS-NEXT: vmovx.f16 s8, s16 -; CHECK-LIS-NEXT: vins.f16 s23, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s17 -; CHECK-LIS-NEXT: vins.f16 s16, s4 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0] +; CHECK-LIS-NEXT: vmovx.f16 s6, s18 +; CHECK-LIS-NEXT: vmov.f32 s22, s19 +; CHECK-LIS-NEXT: vins.f16 s6, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s8 +; CHECK-LIS-NEXT: vins.f16 s22, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s10 ; CHECK-LIS-NEXT: vmovx.f16 s4, s12 -; CHECK-LIS-NEXT: vmovx.f16 s9, s19 -; CHECK-LIS-NEXT: vins.f16 s19, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s15 -; CHECK-LIS-NEXT: vmovx.f16 s11, s5 -; CHECK-LIS-NEXT: vins.f16 s14, s4 -; CHECK-LIS-NEXT: vmovx.f16 s4, s6 -; CHECK-LIS-NEXT: vins.f16 s8, s18 -; CHECK-LIS-NEXT: vmov.f32 s20, s17 -; CHECK-LIS-NEXT: vmovx.f16 s18, s18 -; 
CHECK-LIS-NEXT: vins.f16 s5, s4 -; CHECK-LIS-NEXT: vins.f16 s9, s13 -; CHECK-LIS-NEXT: vins.f16 s20, s18 -; CHECK-LIS-NEXT: vmov.f32 s17, s19 -; CHECK-LIS-NEXT: vins.f16 s11, s7 -; CHECK-LIS-NEXT: vmovx.f16 s13, s13 -; CHECK-LIS-NEXT: vmov.f32 s21, s12 -; CHECK-LIS-NEXT: vmov.f32 s18, s14 -; CHECK-LIS-NEXT: vins.f16 s21, s13 -; CHECK-LIS-NEXT: vmov.f32 s19, s5 +; CHECK-LIS-NEXT: vins.f16 s23, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s13 +; CHECK-LIS-NEXT: vins.f16 s12, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s16 +; CHECK-LIS-NEXT: vmovx.f16 s5, s15 +; CHECK-LIS-NEXT: vins.f16 s15, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s19 +; CHECK-LIS-NEXT: vins.f16 s4, s14 +; CHECK-LIS-NEXT: vmov.f32 s20, s13 +; CHECK-LIS-NEXT: vmovx.f16 s14, s14 +; CHECK-LIS-NEXT: vins.f16 s18, s8 +; CHECK-LIS-NEXT: vmovx.f16 s8, s10 +; CHECK-LIS-NEXT: vmovx.f16 s7, s9 +; CHECK-LIS-NEXT: vins.f16 s20, s14 +; CHECK-LIS-NEXT: vmovx.f16 s14, s17 +; CHECK-LIS-NEXT: vmov.f32 s21, s16 +; CHECK-LIS-NEXT: vins.f16 s9, s8 +; CHECK-LIS-NEXT: vins.f16 s21, s14 +; CHECK-LIS-NEXT: vmov.f32 s13, s15 +; CHECK-LIS-NEXT: vins.f16 s7, s11 +; CHECK-LIS-NEXT: vins.f16 s5, s17 +; CHECK-LIS-NEXT: vmov.f32 s14, s18 +; CHECK-LIS-NEXT: vmov.f32 s15, s9 ; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vadd.i16 q1, q4, q2 +; CHECK-LIS-NEXT: vadd.i16 q1, q3, q1 ; CHECK-LIS-NEXT: vadd.i16 q1, q1, q5 ; CHECK-LIS-NEXT: vstrw.32 q1, [r1] ; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} @@ -935,65 +792,35 @@ entry: ; i64 define void @vld3_v2i64(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v2i64: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s12, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s3 -; CHECK-LV-NEXT: vmov.f32 s2, s4 -; CHECK-LV-NEXT: vmov.f32 s3, s5 -; CHECK-LV-NEXT: vmov r0, r3, d5 -; CHECK-LV-NEXT: vmov r2, r4, d3 -; CHECK-LV-NEXT: vmov r6, r7, d0 -; CHECK-LV-NEXT: vmov r5, r8, d6 -; CHECK-LV-NEXT: vmov lr, r12, d1 -; CHECK-LV-NEXT: adds.w r0, r0, lr -; CHECK-LV-NEXT: adc.w r3, r3, r12 -; CHECK-LV-NEXT: adds r0, r0, r2 -; CHECK-LV-NEXT: adc.w r2, r3, r4 -; CHECK-LV-NEXT: vmov r3, r4, d4 -; CHECK-LV-NEXT: adds r6, r6, r5 -; CHECK-LV-NEXT: adc.w r7, r7, r8 -; CHECK-LV-NEXT: adds r3, r3, r6 -; CHECK-LV-NEXT: adcs r7, r4 -; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-LV-NEXT: vmov q0[3], q0[1], r7, r2 -; CHECK-LV-NEXT: vstrw.32 q0, [r1] -; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc} -; -; CHECK-LIS-LABEL: vld3_v2i64: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s12, s2 -; CHECK-LIS-NEXT: vmov.f32 s13, s3 -; CHECK-LIS-NEXT: vmov.f32 s2, s8 -; CHECK-LIS-NEXT: vmov.f32 s3, s9 -; CHECK-LIS-NEXT: vmov r0, r3, d3 -; CHECK-LIS-NEXT: vmov r2, r4, d5 -; CHECK-LIS-NEXT: vmov r6, r7, d0 -; CHECK-LIS-NEXT: vmov r5, r8, d6 -; CHECK-LIS-NEXT: vmov lr, r12, d1 -; CHECK-LIS-NEXT: adds.w r0, r0, lr -; CHECK-LIS-NEXT: adc.w r3, r3, r12 -; CHECK-LIS-NEXT: adds r0, r0, r2 -; CHECK-LIS-NEXT: adc.w r2, r3, r4 -; CHECK-LIS-NEXT: vmov r3, r4, d2 -; CHECK-LIS-NEXT: adds r6, r6, r5 -; CHECK-LIS-NEXT: adc.w r7, r7, r8 -; CHECK-LIS-NEXT: adds r3, r3, r6 -; CHECK-LIS-NEXT: adcs r7, r4 -; CHECK-LIS-NEXT: vmov q0[2], 
q0[0], r3, r0 -; CHECK-LIS-NEXT: vmov q0[3], q0[1], r7, r2 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1] -; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-LABEL: vld3_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov r0, r3, d5 +; CHECK-NEXT: vmov r2, r4, d3 +; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: vmov r5, r8, d6 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r2, r3, r4 +; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: adds r6, r6, r5 +; CHECK-NEXT: adc.w r7, r7, r8 +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: adcs r7, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <6 x i64>, ptr %src, align 4 %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> @@ -1071,51 +898,50 @@ define void @vld3_v4i64(ptr %src, ptr %dst) { ; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12} ; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] ; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-LIS-NEXT: vmov.f32 s8, s2 +; CHECK-LIS-NEXT: vmov.f32 s4, s2 ; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-LIS-NEXT: vmov.f32 s9, s3 +; CHECK-LIS-NEXT: vmov.f32 s5, s3 ; CHECK-LIS-NEXT: vmov.f32 s2, s12 ; CHECK-LIS-NEXT: vmov.f32 s3, s13 -; CHECK-LIS-NEXT: vmov r2, r3, d3 -; CHECK-LIS-NEXT: vmov r4, r8, d7 +; CHECK-LIS-NEXT: vmov r5, r4, d5 +; CHECK-LIS-NEXT: vmov r3, r8, d7 ; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] ; CHECK-LIS-NEXT: vmov.f32 s24, s22 ; CHECK-LIS-NEXT: vmov.f32 s25, s23 -; CHECK-LIS-NEXT: vmov.f32 s7, s19 ; CHECK-LIS-NEXT: vmov lr, r12, d1 ; CHECK-LIS-NEXT: vmov.f32 s2, s12 ; CHECK-LIS-NEXT: vmov.f32 s3, s13 -; CHECK-LIS-NEXT: vmov r6, r7, d12 -; CHECK-LIS-NEXT: adds.w r0, r2, lr -; CHECK-LIS-NEXT: adc.w r2, r3, r12 -; CHECK-LIS-NEXT: adds.w lr, r0, r4 -; CHECK-LIS-NEXT: vmov r3, r5, d10 -; CHECK-LIS-NEXT: adc.w r12, r2, r8 -; CHECK-LIS-NEXT: vmov r2, r0, d8 -; CHECK-LIS-NEXT: adds r3, r3, r6 -; CHECK-LIS-NEXT: adcs r7, r5 -; CHECK-LIS-NEXT: adds r2, r2, r3 -; CHECK-LIS-NEXT: adc.w r8, r7, r0 -; CHECK-LIS-NEXT: vmov r6, r5, d1 -; CHECK-LIS-NEXT: vmov r3, r7, d3 -; CHECK-LIS-NEXT: vmov r4, r0, d0 -; CHECK-LIS-NEXT: adds r3, r3, r6 -; CHECK-LIS-NEXT: adcs r7, r5 -; CHECK-LIS-NEXT: vmov r6, r5, d7 -; CHECK-LIS-NEXT: adds r3, r3, r6 -; CHECK-LIS-NEXT: adcs r7, r5 -; CHECK-LIS-NEXT: vmov r6, r5, d4 -; CHECK-LIS-NEXT: adds r4, r4, r6 -; CHECK-LIS-NEXT: adcs r0, r5 -; CHECK-LIS-NEXT: vmov r5, r6, d2 -; CHECK-LIS-NEXT: vmov q1[2], q1[0], r2, r3 -; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r7 +; CHECK-LIS-NEXT: vmov r7, r6, d12 +; CHECK-LIS-NEXT: adds.w r0, r5, lr +; CHECK-LIS-NEXT: adc.w r5, r4, r12 +; CHECK-LIS-NEXT: adds.w lr, r0, r3 +; CHECK-LIS-NEXT: vmov r4, r2, d10 +; CHECK-LIS-NEXT: adc.w r12, r5, r8 +; CHECK-LIS-NEXT: vmov r5, r0, d8 +; CHECK-LIS-NEXT: adds r7, r7, r4 +; CHECK-LIS-NEXT: adcs r2, r6 +; CHECK-LIS-NEXT: adds r7, r7, r5 +; CHECK-LIS-NEXT: adc.w r8, r2, r0 +; CHECK-LIS-NEXT: vmov r6, r4, d1 +; CHECK-LIS-NEXT: vmov 
r2, r5, d9 +; CHECK-LIS-NEXT: vmov r3, r0, d0 +; CHECK-LIS-NEXT: adds r2, r2, r6 +; CHECK-LIS-NEXT: adc.w r6, r5, r4 +; CHECK-LIS-NEXT: vmov r5, r4, d7 +; CHECK-LIS-NEXT: adds r2, r2, r5 +; CHECK-LIS-NEXT: adcs r6, r4 +; CHECK-LIS-NEXT: vmov r5, r4, d2 +; CHECK-LIS-NEXT: vmov q1[2], q1[0], r7, r2 +; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r6 ; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-LIS-NEXT: adds r4, r4, r5 -; CHECK-LIS-NEXT: vmov q0[2], q0[0], r4, lr -; CHECK-LIS-NEXT: adcs r0, r6 +; CHECK-LIS-NEXT: adds r3, r3, r5 +; CHECK-LIS-NEXT: adcs r0, r4 +; CHECK-LIS-NEXT: vmov r4, r5, d4 +; CHECK-LIS-NEXT: adds r3, r3, r4 +; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-LIS-NEXT: adcs r0, r5 ; CHECK-LIS-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-LIS-NEXT: vstrw.32 q0, [r1] ; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12} @@ -1194,87 +1020,46 @@ entry: } define void @vld3_v8f32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v8f32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.f32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.f32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vadd.f32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vadd.f32 q1, q4, q1 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v8f32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s13, s0 -; CHECK-LIS-NEXT: vmov.f32 s14, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s12, s5 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s11, s17 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vadd.f32 q2, q2, q3 -; CHECK-LIS-NEXT: vmov.f32 s2, s16 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s3, s19 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-LIS-NEXT: vadd.f32 q0, q2, q0 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] -; CHECK-LIS-NEXT: vmov.f32 s13, s4 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s14, s7 -; CHECK-LIS-NEXT: vmov.f32 s22, s6 -; CHECK-LIS-NEXT: vmov.f32 s12, s9 -; CHECK-LIS-NEXT: vmov.f32 s15, s18 -; CHECK-LIS-NEXT: vmov.f32 s20, s8 -; CHECK-LIS-NEXT: vmov.f32 s21, s11 
-; CHECK-LIS-NEXT: vmov.f32 s23, s17 -; CHECK-LIS-NEXT: vadd.f32 q3, q5, q3 -; CHECK-LIS-NEXT: vmov.f32 s4, s10 -; CHECK-LIS-NEXT: vmov.f32 s6, s16 -; CHECK-LIS-NEXT: vmov.f32 s7, s19 -; CHECK-LIS-NEXT: vadd.f32 q1, q3, q1 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.f32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vadd.f32 q1, q4, q1 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr entry: %l1 = load <24 x float>, ptr %src, align 4 %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> @@ -1287,155 +1072,80 @@ entry: } define void @vld3_v16f32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v16f32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.f32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.f32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vadd.f32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-LV-NEXT: vadd.f32 q1, q4, q1 -; CHECK-LV-NEXT: vmov.f32 s18, s10 -; CHECK-LV-NEXT: vmov.f32 s21, s8 -; CHECK-LV-NEXT: vmov.f32 s22, s11 -; CHECK-LV-NEXT: vmov.f32 s16, s12 -; CHECK-LV-NEXT: vmov.f32 s17, s15 -; 
CHECK-LV-NEXT: vmov.f32 s20, s13
-; CHECK-LV-NEXT: vmov.f32 s23, s26
-; CHECK-LV-NEXT: vmov.f32 s19, s25
-; CHECK-LV-NEXT: vadd.f32 q4, q4, q5
-; CHECK-LV-NEXT: vmov.f32 s8, s14
-; CHECK-LV-NEXT: vmov.f32 s10, s24
-; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #112]
-; CHECK-LV-NEXT: vmov.f32 s11, s27
-; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #128]
-; CHECK-LV-NEXT: vadd.f32 q2, q4, q2
-; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #96]
-; CHECK-LV-NEXT: vmov.f32 s25, s12
-; CHECK-LV-NEXT: vstrw.32 q2, [r1, #48]
-; CHECK-LV-NEXT: vmov.f32 s26, s15
-; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-LV-NEXT: vmov.f32 s30, s14
-; CHECK-LV-NEXT: vstrw.32 q1, [r1]
-; CHECK-LV-NEXT: vmov.f32 s24, s17
-; CHECK-LV-NEXT: vmov.f32 s27, s22
-; CHECK-LV-NEXT: vmov.f32 s28, s16
-; CHECK-LV-NEXT: vmov.f32 s29, s19
-; CHECK-LV-NEXT: vmov.f32 s31, s21
-; CHECK-LV-NEXT: vadd.f32 q6, q7, q6
-; CHECK-LV-NEXT: vmov.f32 s12, s18
-; CHECK-LV-NEXT: vmov.f32 s14, s20
-; CHECK-LV-NEXT: vmov.f32 s15, s23
-; CHECK-LV-NEXT: vadd.f32 q3, q6, q3
-; CHECK-LV-NEXT: vstrw.32 q3, [r1, #32]
-; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LV-NEXT: bx lr
-;
-; CHECK-LIS-LABEL: vld3_v16f32:
-; CHECK-LIS: @ %bb.0: @ %entry
-; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64]
-; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48]
-; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80]
-; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #176]
-; CHECK-LIS-NEXT: vmov.f32 s10, s2
-; CHECK-LIS-NEXT: vmov.f32 s13, s0
-; CHECK-LIS-NEXT: vmov.f32 s14, s3
-; CHECK-LIS-NEXT: vmov.f32 s8, s4
-; CHECK-LIS-NEXT: vmov.f32 s9, s7
-; CHECK-LIS-NEXT: vmov.f32 s12, s5
-; CHECK-LIS-NEXT: vmov.f32 s15, s18
-; CHECK-LIS-NEXT: vmov.f32 s11, s17
-; CHECK-LIS-NEXT: vmov.f32 s0, s6
-; CHECK-LIS-NEXT: vadd.f32 q2, q2, q3
-; CHECK-LIS-NEXT: vmov.f32 s2, s16
-; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-LIS-NEXT: vmov.f32 s3, s19
-; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-LIS-NEXT: vadd.f32 q0, q2, q0
-; CHECK-LIS-NEXT: vldrw.u32 q2, [r0]
-; CHECK-LIS-NEXT: vmov.f32 s13, s4
-; CHECK-LIS-NEXT: vmov.f32 s14, s7
-; CHECK-LIS-NEXT: vmov.f32 s22, s6
-; CHECK-LIS-NEXT: vmov.f32 s12, s9
-; CHECK-LIS-NEXT: vmov.f32 s15, s18
-; CHECK-LIS-NEXT: vmov.f32 s20, s8
-; CHECK-LIS-NEXT: vmov.f32 s21, s11
-; CHECK-LIS-NEXT: vmov.f32 s23, s17
-; CHECK-LIS-NEXT: vadd.f32 q3, q5, q3
-; CHECK-LIS-NEXT: vmov.f32 s4, s10
-; CHECK-LIS-NEXT: vmov.f32 s7, s19
-; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #160]
-; CHECK-LIS-NEXT: vmov.f32 s6, s16
-; CHECK-LIS-NEXT: vadd.f32 q1, q3, q1
-; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #144]
-; CHECK-LIS-NEXT: vmov.f32 s18, s10
-; CHECK-LIS-NEXT: vmov.f32 s21, s8
-; CHECK-LIS-NEXT: vmov.f32 s22, s11
-; CHECK-LIS-NEXT: vmov.f32 s16, s12
-; CHECK-LIS-NEXT: vmov.f32 s17, s15
-; CHECK-LIS-NEXT: vmov.f32 s20, s13
-; CHECK-LIS-NEXT: vmov.f32 s23, s26
-; CHECK-LIS-NEXT: vmov.f32 s19, s25
-; CHECK-LIS-NEXT: vmov.f32 s8, s14
-; CHECK-LIS-NEXT: vadd.f32 q4, q4, q5
-; CHECK-LIS-NEXT: vmov.f32 s10, s24
-; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #112]
-; CHECK-LIS-NEXT: vmov.f32 s11, s27
-; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #128]
-; CHECK-LIS-NEXT: vadd.f32 q2, q4, q2
-; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #96]
-; CHECK-LIS-NEXT: vmov.f32 s21, s12
-; CHECK-LIS-NEXT: vstrw.32 q2, [r1, #48]
-; CHECK-LIS-NEXT: vmov.f32 s22, s15
-; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-LIS-NEXT: vmov.f32 s30, s14
-; CHECK-LIS-NEXT: vstrw.32 q1, [r1]
-; CHECK-LIS-NEXT: vmov.f32 s20, s17
-; CHECK-LIS-NEXT: vmov.f32 s23, s26
-; CHECK-LIS-NEXT: vmov.f32 s28, s16
-; CHECK-LIS-NEXT: vmov.f32 s29, s19
-; CHECK-LIS-NEXT: vmov.f32 s31, s25
-; CHECK-LIS-NEXT: vadd.f32 q5, q7, q5
-; CHECK-LIS-NEXT: vmov.f32 s12, s18
-; CHECK-LIS-NEXT: vmov.f32 s14, s24
-; CHECK-LIS-NEXT: vmov.f32 s15, s27
-; CHECK-LIS-NEXT: vadd.f32 q3, q5, q3
-; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #32]
-; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-LIS-NEXT: bx lr
+; CHECK-LABEL: vld3_v16f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q6, [r0, #176]
+; CHECK-NEXT: vmov.f32 s10, s2
+; CHECK-NEXT: vmov.f32 s13, s0
+; CHECK-NEXT: vmov.f32 s14, s3
+; CHECK-NEXT: vmov.f32 s8, s4
+; CHECK-NEXT: vmov.f32 s9, s7
+; CHECK-NEXT: vmov.f32 s12, s5
+; CHECK-NEXT: vmov.f32 s15, s18
+; CHECK-NEXT: vmov.f32 s11, s17
+; CHECK-NEXT: vadd.f32 q2, q2, q3
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vmov.f32 s2, s16
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vmov.f32 s3, s19
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vadd.f32 q0, q2, q0
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vmov.f32 s17, s4
+; CHECK-NEXT: vmov.f32 s18, s7
+; CHECK-NEXT: vmov.f32 s22, s6
+; CHECK-NEXT: vmov.f32 s16, s9
+; CHECK-NEXT: vmov.f32 s19, s14
+; CHECK-NEXT: vmov.f32 s20, s8
+; CHECK-NEXT: vmov.f32 s21, s11
+; CHECK-NEXT: vmov.f32 s23, s13
+; CHECK-NEXT: vmov.f32 s4, s10
+; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT: vmov.f32 s6, s12
+; CHECK-NEXT: vadd.f32 q4, q5, q4
+; CHECK-NEXT: vmov.f32 s7, s15
+; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
+; CHECK-NEXT: vadd.f32 q1, q4, q1
+; CHECK-NEXT: vmov.f32 s18, s10
+; CHECK-NEXT: vmov.f32 s21, s8
+; CHECK-NEXT: vmov.f32 s22, s11
+; CHECK-NEXT: vmov.f32 s16, s12
+; CHECK-NEXT: vmov.f32 s17, s15
+; CHECK-NEXT: vmov.f32 s20, s13
+; CHECK-NEXT: vmov.f32 s23, s26
+; CHECK-NEXT: vmov.f32 s19, s25
+; CHECK-NEXT: vadd.f32 q4, q4, q5
+; CHECK-NEXT: vmov.f32 s8, s14
+; CHECK-NEXT: vmov.f32 s10, s24
+; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT: vmov.f32 s11, s27
+; CHECK-NEXT: vldrw.u32 q5, [r0, #128]
+; CHECK-NEXT: vadd.f32 q2, q4, q2
+; CHECK-NEXT: vldrw.u32 q4, [r0, #96]
+; CHECK-NEXT: vmov.f32 s25, s12
+; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vmov.f32 s26, s15
+; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vmov.f32 s30, s14
+; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vmov.f32 s24, s17
+; CHECK-NEXT: vmov.f32 s27, s22
+; CHECK-NEXT: vmov.f32 s28, s16
+; CHECK-NEXT: vmov.f32 s29, s19
+; CHECK-NEXT: vmov.f32 s31, s21
+; CHECK-NEXT: vadd.f32 q6, q7, q6
+; CHECK-NEXT: vmov.f32 s12, s18
+; CHECK-NEXT: vmov.f32 s14, s20
+; CHECK-NEXT: vmov.f32 s15, s23
+; CHECK-NEXT: vadd.f32 q3, q6, q3
+; CHECK-NEXT: vstrw.32 q3, [r1, #32]
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: bx lr
 entry:
  %l1 = load <48 x float>, ptr %src, align 4
  %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32>
@@ -1518,49 +1228,93 @@ entry:
 }

 define void @vld3_v8f16(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v8f16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT: vmov.f32 s5, s8
-; CHECK-NEXT: vmovx.f16 s8, s8
-; CHECK-NEXT: vmovx.f16 s17, s3
-; CHECK-NEXT: vins.f16 s3, s8
-; CHECK-NEXT: vmovx.f16 s8, s11
-; CHECK-NEXT: vmovx.f16 s18, s10
-; CHECK-NEXT: vmovx.f16 s16, s0
-; CHECK-NEXT: vins.f16 s10, s8
-; CHECK-NEXT: vmovx.f16 s6, s2
-; CHECK-NEXT: vmov.f32 s4, s1
-; CHECK-NEXT: vmovx.f16 s8, s14
-; CHECK-NEXT: vmovx.f16 s19, s13
-; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s9
-; CHECK-NEXT: vins.f16 s16, s2
-; CHECK-NEXT: vmovx.f16 s2, s15
-; CHECK-NEXT: vmovx.f16 s7, s12
-; CHECK-NEXT: vins.f16 s18, s12
-; CHECK-NEXT: vmovx.f16 s12, s1
-; CHECK-NEXT: vins.f16 s13, s8
-; CHECK-NEXT: vins.f16 s5, s6
-; CHECK-NEXT: vmov.f32 s6, s11
-; CHECK-NEXT: vins.f16 s14, s2
-; CHECK-NEXT: vmov.f32 s1, s3
-; CHECK-NEXT: vins.f16 s19, s15
-; CHECK-NEXT: vins.f16 s17, s9
-; CHECK-NEXT: vins.f16 s0, s12
-; CHECK-NEXT: vmov.f32 s2, s10
-; CHECK-NEXT: vmov.f32 s3, s13
-; CHECK-NEXT: vins.f16 s6, s7
-; CHECK-NEXT: vmov.f32 s7, s14
-; CHECK-NEXT: vadd.f16 q0, q0, q4
-; CHECK-NEXT: vadd.f16 q0, q0, q1
-; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: bx lr
+; CHECK-LV-LABEL: vld3_v8f16:
+; CHECK-LV: @ %bb.0: @ %entry
+; CHECK-LV-NEXT: .vsave {d8, d9}
+; CHECK-LV-NEXT: vpush {d8, d9}
+; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-LV-NEXT: vldrw.u32 q0, [r0]
+; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-LV-NEXT: vmov.f32 s5, s8
+; CHECK-LV-NEXT: vmovx.f16 s8, s8
+; CHECK-LV-NEXT: vmovx.f16 s17, s3
+; CHECK-LV-NEXT: vins.f16 s3, s8
+; CHECK-LV-NEXT: vmovx.f16 s8, s11
+; CHECK-LV-NEXT: vmovx.f16 s18, s10
+; CHECK-LV-NEXT: vmovx.f16 s16, s0
+; CHECK-LV-NEXT: vins.f16 s10, s8
+; CHECK-LV-NEXT: vmovx.f16 s6, s2
+; CHECK-LV-NEXT: vmov.f32 s4, s1
+; CHECK-LV-NEXT: vmovx.f16 s8, s14
+; CHECK-LV-NEXT: vmovx.f16 s19, s13
+; CHECK-LV-NEXT: vins.f16 s4, s6
+; CHECK-LV-NEXT: vmovx.f16 s6, s9
+; CHECK-LV-NEXT: vins.f16 s16, s2
+; CHECK-LV-NEXT: vmovx.f16 s2, s15
+; CHECK-LV-NEXT: vmovx.f16 s7, s12
+; CHECK-LV-NEXT: vins.f16 s18, s12
+; CHECK-LV-NEXT: vmovx.f16 s12, s1
+; CHECK-LV-NEXT: vins.f16 s13, s8
+; CHECK-LV-NEXT: vins.f16 s5, s6
+; CHECK-LV-NEXT: vmov.f32 s6, s11
+; CHECK-LV-NEXT: vins.f16 s14, s2
+; CHECK-LV-NEXT: vmov.f32 s1, s3
+; CHECK-LV-NEXT: vins.f16 s19, s15
+; CHECK-LV-NEXT: vins.f16 s17, s9
+; CHECK-LV-NEXT: vins.f16 s0, s12
+; CHECK-LV-NEXT: vmov.f32 s2, s10
+; CHECK-LV-NEXT: vmov.f32 s3, s13
+; CHECK-LV-NEXT: vins.f16 s6, s7
+; CHECK-LV-NEXT: vmov.f32 s7, s14
+; CHECK-LV-NEXT: vadd.f16 q0, q0, q4
+; CHECK-LV-NEXT: vadd.f16 q0, q0, q1
+; CHECK-LV-NEXT: vstrw.32 q0, [r1]
+; CHECK-LV-NEXT: vpop {d8, d9}
+; CHECK-LV-NEXT: bx lr
+;
+; CHECK-LIS-LABEL: vld3_v8f16:
+; CHECK-LIS: @ %bb.0: @ %entry
+; CHECK-LIS-NEXT: .vsave {d8, d9}
+; CHECK-LIS-NEXT: vpush {d8, d9}
+; CHECK-LIS-NEXT: vldrw.u32 q0, [r0]
+; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32]
+; CHECK-LIS-NEXT: vmov.f32 s4, s1
+; CHECK-LIS-NEXT: vmovx.f16 s6, s2
+; CHECK-LIS-NEXT: vins.f16 s4, s6
+; CHECK-LIS-NEXT: vmov.f32 s5, s8
+; CHECK-LIS-NEXT: vmovx.f16 s6, s9
+; CHECK-LIS-NEXT: vmovx.f16 s8, s8
+; CHECK-LIS-NEXT: vmovx.f16 s13, s3
+; CHECK-LIS-NEXT: vins.f16 s5, s6
+; CHECK-LIS-NEXT: vins.f16 s3, s8
+; CHECK-LIS-NEXT: vmov.f32 s6, s11
+; CHECK-LIS-NEXT: vmovx.f16 s12, s16
+; CHECK-LIS-NEXT: vmovx.f16 s8, s11
+; CHECK-LIS-NEXT: vmovx.f16 s14, s10
+; CHECK-LIS-NEXT: vins.f16 s6, s12
+; CHECK-LIS-NEXT: vmovx.f16 s12, s0
+; CHECK-LIS-NEXT: vins.f16 s10, s8
+; CHECK-LIS-NEXT: vmovx.f16 s8, s18
+; CHECK-LIS-NEXT: vmovx.f16 s15, s17
+; CHECK-LIS-NEXT: vins.f16 s12, s2
+; CHECK-LIS-NEXT: vmovx.f16 s2, s19
+; CHECK-LIS-NEXT: vmovx.f16 s1, s1
+; CHECK-LIS-NEXT: vins.f16 s17, s8
+; CHECK-LIS-NEXT: vins.f16 s18, s2
+; CHECK-LIS-NEXT: vins.f16 s0, s1
+; CHECK-LIS-NEXT: vmov.f32 s1, s3
+; CHECK-LIS-NEXT: vins.f16 s14, s16
+; CHECK-LIS-NEXT: vins.f16 s15, s19
+; CHECK-LIS-NEXT: vins.f16 s13, s9
+; CHECK-LIS-NEXT: vmov.f32 s2, s10
+; CHECK-LIS-NEXT: vmov.f32 s3, s17
+; CHECK-LIS-NEXT: vmov.f32 s7, s18
+; CHECK-LIS-NEXT: vadd.f16 q0, q0, q3
+; CHECK-LIS-NEXT: vadd.f16 q0, q0, q1
+; CHECK-LIS-NEXT: vstrw.32 q0, [r1]
+; CHECK-LIS-NEXT: vpop {d8, d9}
+; CHECK-LIS-NEXT: bx lr
 entry:
  %l1 = load <24 x half>, ptr %src, align 4
  %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32>
@@ -1573,86 +1327,167 @@ entry:
 }

 define void @vld3_v16f16(ptr %src, ptr %dst) {
-; CHECK-LABEL: vld3_v16f16:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT: vmovx.f16 s6, s2
-; CHECK-NEXT: vmov.f32 s4, s1
-; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s9
-; CHECK-NEXT: vmov.f32 s5, s8
-; CHECK-NEXT: vmovx.f16 s7, s12
-; CHECK-NEXT: vins.f16 s5, s6
-; CHECK-NEXT: vmov.f32 s6, s11
-; CHECK-NEXT: vins.f16 s6, s7
-; CHECK-NEXT: vmovx.f16 s16, s15
-; CHECK-NEXT: vmov.f32 s7, s14
-; CHECK-NEXT: vmovx.f16 s17, s3
-; CHECK-NEXT: vins.f16 s7, s16
-; CHECK-NEXT: vmovx.f16 s16, s0
-; CHECK-NEXT: vins.f16 s16, s2
-; CHECK-NEXT: vmovx.f16 s2, s1
-; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmovx.f16 s2, s8
-; CHECK-NEXT: vins.f16 s3, s2
-; CHECK-NEXT: vmovx.f16 s2, s11
-; CHECK-NEXT: vmovx.f16 s18, s10
-; CHECK-NEXT: vins.f16 s10, s2
-; CHECK-NEXT: vmovx.f16 s2, s14
-; CHECK-NEXT: vmovx.f16 s19, s13
-; CHECK-NEXT: vins.f16 s13, s2
-; CHECK-NEXT: vmov.f32 s1, s3
-; CHECK-NEXT: vins.f16 s18, s12
-; CHECK-NEXT: vins.f16 s19, s15
-; CHECK-NEXT: vmov.f32 s3, s13
-; CHECK-NEXT: vins.f16 s17, s9
-; CHECK-NEXT: vmov.f32 s2, s10
-; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT: vadd.f16 q0, q0, q4
-; CHECK-NEXT: vadd.f16 q2, q0, q1
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vstrw.32 q2, [r1, #16]
-; CHECK-NEXT: vmovx.f16 s10, s2
-; CHECK-NEXT: vmov.f32 s8, s1
-; CHECK-NEXT: vins.f16 s8, s10
-; CHECK-NEXT: vmovx.f16 s10, s13
-; CHECK-NEXT: vmov.f32 s9, s12
-; CHECK-NEXT: vmovx.f16 s11, s4
-; CHECK-NEXT: vins.f16 s9, s10
-; CHECK-NEXT: vmov.f32 s10, s15
-; CHECK-NEXT: vins.f16 s10, s11
-; CHECK-NEXT: vmovx.f16 s16, s7
-; CHECK-NEXT: vmov.f32 s11, s6
-; CHECK-NEXT: vmovx.f16 s17, s3
-; CHECK-NEXT: vins.f16 s11, s16
-; CHECK-NEXT: vmovx.f16 s16, s0
-; CHECK-NEXT: vins.f16 s16, s2
-; CHECK-NEXT: vmovx.f16 s2, s1
-; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmovx.f16 s2, s12
-; CHECK-NEXT: vins.f16 s3, s2
-; CHECK-NEXT: vmovx.f16 s2, s15
-; CHECK-NEXT: vmovx.f16 s18, s14
-; CHECK-NEXT: vins.f16 s14, s2
-; CHECK-NEXT: vmovx.f16 s2, s6
-; CHECK-NEXT: vmovx.f16 s19, s5
-; CHECK-NEXT: vins.f16 s5, s2
-; CHECK-NEXT: vmov.f32 s1, s3
-; CHECK-NEXT: vins.f16 s18, s4
-; CHECK-NEXT: vins.f16 s19, s7
-; CHECK-NEXT: vins.f16 s17, s13
-; CHECK-NEXT: vmov.f32 s2, s14
-; CHECK-NEXT: vmov.f32 s3, s5
-; CHECK-NEXT: vadd.f16 q0, q0, q4
-; CHECK-NEXT: vadd.f16 q0, q0, q2
-; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: bx lr
+; CHECK-LV-LABEL: vld3_v16f16:
+; CHECK-LV: @ %bb.0: @ %entry
+; CHECK-LV-NEXT: .vsave {d8, d9}
+; CHECK-LV-NEXT: vpush {d8, d9}
+; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #64]
+; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80]
+; CHECK-LV-NEXT: vmovx.f16 s6, s2
+; CHECK-LV-NEXT: vmov.f32 s4, s1
+; CHECK-LV-NEXT: vins.f16 s4, s6
+; CHECK-LV-NEXT: vmovx.f16 s6, s9
+; CHECK-LV-NEXT: vmov.f32 s5, s8
+; CHECK-LV-NEXT: vmovx.f16 s7, s12
+; CHECK-LV-NEXT: vins.f16 s5, s6
+; CHECK-LV-NEXT: vmov.f32 s6, s11
+; CHECK-LV-NEXT: vins.f16 s6, s7
+; CHECK-LV-NEXT: vmovx.f16 s16, s15
+; CHECK-LV-NEXT: vmov.f32 s7, s14
+; CHECK-LV-NEXT: vmovx.f16 s17, s3
+; CHECK-LV-NEXT: vins.f16 s7, s16
+; CHECK-LV-NEXT: vmovx.f16 s16, s0
+; CHECK-LV-NEXT: vins.f16 s16, s2
+; CHECK-LV-NEXT: vmovx.f16 s2, s1
+; CHECK-LV-NEXT: vins.f16 s0, s2
+; CHECK-LV-NEXT: vmovx.f16 s2, s8
+; CHECK-LV-NEXT: vins.f16 s3, s2
+; CHECK-LV-NEXT: vmovx.f16 s2, s11
+; CHECK-LV-NEXT: vmovx.f16 s18, s10
+; CHECK-LV-NEXT: vins.f16 s10, s2
+; CHECK-LV-NEXT: vmovx.f16 s2, s14
+; CHECK-LV-NEXT: vmovx.f16 s19, s13
+; CHECK-LV-NEXT: vins.f16 s13, s2
+; CHECK-LV-NEXT: vmov.f32 s1, s3
+; CHECK-LV-NEXT: vins.f16 s18, s12
+; CHECK-LV-NEXT: vins.f16 s19, s15
+; CHECK-LV-NEXT: vmov.f32 s3, s13
+; CHECK-LV-NEXT: vins.f16 s17, s9
+; CHECK-LV-NEXT: vmov.f32 s2, s10
+; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-LV-NEXT: vadd.f16 q0, q0, q4
+; CHECK-LV-NEXT: vadd.f16 q2, q0, q1
+; CHECK-LV-NEXT: vldrw.u32 q0, [r0]
+; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-LV-NEXT: vstrw.32 q2, [r1, #16]
+; CHECK-LV-NEXT: vmovx.f16 s10, s2
+; CHECK-LV-NEXT: vmov.f32 s8, s1
+; CHECK-LV-NEXT: vins.f16 s8, s10
+; CHECK-LV-NEXT: vmovx.f16 s10, s13
+; CHECK-LV-NEXT: vmov.f32 s9, s12
+; CHECK-LV-NEXT: vmovx.f16 s11, s4
+; CHECK-LV-NEXT: vins.f16 s9, s10
+; CHECK-LV-NEXT: vmov.f32 s10, s15
+; CHECK-LV-NEXT: vins.f16 s10, s11
+; CHECK-LV-NEXT: vmovx.f16 s16, s7
+; CHECK-LV-NEXT: vmov.f32 s11, s6
+; CHECK-LV-NEXT: vmovx.f16 s17, s3
+; CHECK-LV-NEXT: vins.f16 s11, s16
+; CHECK-LV-NEXT: vmovx.f16 s16, s0
+; CHECK-LV-NEXT: vins.f16 s16, s2
+; CHECK-LV-NEXT: vmovx.f16 s2, s1
+; CHECK-LV-NEXT: vins.f16 s0, s2
+; CHECK-LV-NEXT: vmovx.f16 s2, s12
+; CHECK-LV-NEXT: vins.f16 s3, s2
+; CHECK-LV-NEXT: vmovx.f16 s2, s15
+; CHECK-LV-NEXT: vmovx.f16 s18, s14
+; CHECK-LV-NEXT: vins.f16 s14, s2
+; CHECK-LV-NEXT: vmovx.f16 s2, s6
+; CHECK-LV-NEXT: vmovx.f16 s19, s5
+; CHECK-LV-NEXT: vins.f16 s5, s2
+; CHECK-LV-NEXT: vmov.f32 s1, s3
+; CHECK-LV-NEXT: vins.f16 s18, s4
+; CHECK-LV-NEXT: vins.f16 s19, s7
+; CHECK-LV-NEXT: vins.f16 s17, s13
+; CHECK-LV-NEXT: vmov.f32 s2, s14
+; CHECK-LV-NEXT: vmov.f32 s3, s5
+; CHECK-LV-NEXT: vadd.f16 q0, q0, q4
+; CHECK-LV-NEXT: vadd.f16 q0, q0, q2
+; CHECK-LV-NEXT: vstrw.32 q0, [r1]
+; CHECK-LV-NEXT: vpop {d8, d9}
+; CHECK-LV-NEXT: bx lr
+;
+; CHECK-LIS-LABEL: vld3_v16f16:
+; CHECK-LIS: @ %bb.0: @ %entry
+; CHECK-LIS-NEXT: .vsave {d8, d9}
+; CHECK-LIS-NEXT: vpush {d8, d9}
+; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #64]
+; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80]
+; CHECK-LIS-NEXT: vmovx.f16 s6, s2
+; CHECK-LIS-NEXT: vmov.f32 s4, s1
+; CHECK-LIS-NEXT: vins.f16 s4, s6
+; CHECK-LIS-NEXT: vmovx.f16 s6, s9
+; CHECK-LIS-NEXT: vmov.f32 s5, s8
+; CHECK-LIS-NEXT: vmovx.f16 s7, s12
+; CHECK-LIS-NEXT: vins.f16 s5, s6
+; CHECK-LIS-NEXT: vmov.f32 s6, s11
+; CHECK-LIS-NEXT: vins.f16 s6, s7
+; CHECK-LIS-NEXT: vmovx.f16 s16, s15
+; CHECK-LIS-NEXT: vmov.f32 s7, s14
+; CHECK-LIS-NEXT: vmovx.f16 s17, s3
+; CHECK-LIS-NEXT: vins.f16 s7, s16
+; CHECK-LIS-NEXT: vmovx.f16 s16, s0
+; CHECK-LIS-NEXT: vins.f16 s16, s2
+; CHECK-LIS-NEXT: vmovx.f16 s2, s1
+; CHECK-LIS-NEXT: vins.f16 s0, s2
+; CHECK-LIS-NEXT: vmovx.f16 s2, s8
+; CHECK-LIS-NEXT: vins.f16 s3, s2
+; CHECK-LIS-NEXT: vmovx.f16 s2, s11
+; CHECK-LIS-NEXT: vmovx.f16 s18, s10
+; CHECK-LIS-NEXT: vins.f16 s10, s2
+; CHECK-LIS-NEXT: vmovx.f16 s2, s14
+; CHECK-LIS-NEXT: vmovx.f16 s19, s13
+; CHECK-LIS-NEXT: vins.f16 s13, s2
+; CHECK-LIS-NEXT: vmov.f32 s1, s3
+; CHECK-LIS-NEXT: vins.f16 s18, s12
+; CHECK-LIS-NEXT: vins.f16 s19, s15
+; CHECK-LIS-NEXT: vmov.f32 s3, s13
+; CHECK-LIS-NEXT: vins.f16 s17, s9
+; CHECK-LIS-NEXT: vmov.f32 s2, s10
+; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4
+; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-LIS-NEXT: vadd.f16 q1, q0, q1
+; CHECK-LIS-NEXT: vldrw.u32 q0, [r0]
+; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-LIS-NEXT: vmov.f32 s5, s12
+; CHECK-LIS-NEXT: vmovx.f16 s6, s2
+; CHECK-LIS-NEXT: vmov.f32 s4, s1
+; CHECK-LIS-NEXT: vins.f16 s4, s6
+; CHECK-LIS-NEXT: vmovx.f16 s6, s13
+; CHECK-LIS-NEXT: vins.f16 s5, s6
+; CHECK-LIS-NEXT: vmov.f32 s6, s15
+; CHECK-LIS-NEXT: vmovx.f16 s7, s8
+; CHECK-LIS-NEXT: vmovx.f16 s16, s11
+; CHECK-LIS-NEXT: vins.f16 s6, s7
+; CHECK-LIS-NEXT: vmov.f32 s7, s10
+; CHECK-LIS-NEXT: vins.f16 s7, s16
+; CHECK-LIS-NEXT: vmovx.f16 s16, s0
+; CHECK-LIS-NEXT: vins.f16 s16, s2
+; CHECK-LIS-NEXT: vmovx.f16 s2, s1
+; CHECK-LIS-NEXT: vins.f16 s0, s2
+; CHECK-LIS-NEXT: vmovx.f16 s2, s12
+; CHECK-LIS-NEXT: vmovx.f16 s17, s3
+; CHECK-LIS-NEXT: vins.f16 s3, s2
+; CHECK-LIS-NEXT: vmovx.f16 s2, s15
+; CHECK-LIS-NEXT: vmovx.f16 s18, s14
+; CHECK-LIS-NEXT: vins.f16 s14, s2
+; CHECK-LIS-NEXT: vmovx.f16 s2, s10
+; CHECK-LIS-NEXT: vmovx.f16 s19, s9
+; CHECK-LIS-NEXT: vins.f16 s9, s2
+; CHECK-LIS-NEXT: vmov.f32 s1, s3
+; CHECK-LIS-NEXT: vins.f16 s18, s8
+; CHECK-LIS-NEXT: vins.f16 s19, s11
+; CHECK-LIS-NEXT: vins.f16 s17, s13
+; CHECK-LIS-NEXT: vmov.f32 s2, s14
+; CHECK-LIS-NEXT: vmov.f32 s3, s9
+; CHECK-LIS-NEXT: vadd.f16 q0, q0, q4
+; CHECK-LIS-NEXT: vadd.f16 q0, q0, q1
+; CHECK-LIS-NEXT: vstrw.32 q0, [r1]
+; CHECK-LIS-NEXT: vpop {d8, d9}
+; CHECK-LIS-NEXT: bx lr
 entry:
  %l1 = load <48 x half>, ptr %src, align 4
  %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32>
diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
index 6d6f9aca7188d..0c349c3aa8ec1 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
@@ -77,29 +77,32 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
 ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = SHL64ri [[COPY8]], 2, implicit-def dead $eflags
 ; CHECK-NEXT: MOV64mr %stack.10, 1, $noreg, 0, $noreg, [[COPY8]] :: (store (s64) into %stack.10)
 ; CHECK-NEXT: [[LEA64r1:%[0-9]+]]:gr64 = LEA64r $noreg, 4, [[MOVSX64rr32_3]], 0, $noreg
+ ; CHECK-NEXT: MOV64mr %stack.11, 1, $noreg, 0, $noreg, [[LEA64r1]] :: (store (s64) into %stack.11)
 ; CHECK-NEXT: MOV64mr %stack.4, 1, $noreg, 0, $noreg, [[MOVSX64rm32_]] :: (store (s64) into %stack.4)
 ; CHECK-NEXT: [[LEA64_32r3:%[0-9]+]]:gr32 = LEA64_32r [[COPY5]], 4, [[MOVSX64rm32_]], 0, $noreg
- ; CHECK-NEXT: MOV32mr %stack.11, 1, $noreg, 0, $noreg, [[LEA64_32r3]] :: (store (s32) into %stack.11)
+ ; CHECK-NEXT: MOV32mr %stack.12, 1, $noreg, 0, $noreg, [[LEA64_32r3]] :: (store (s32) into %stack.12)
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.2.for.cond14.preheader:
 ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
 ; CHECK-NEXT: CMP32rm [[MOV32rm4]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16)
+ ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
+ ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
 ; CHECK-NEXT: JCC_1 %bb.5, 13, implicit $eflags
 ; CHECK-NEXT: JMP_1 %bb.3
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.3.for.body17.lr.ph:
 ; CHECK-NEXT: successors: %bb.6(0x80000000)
 ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm]], [[MOVSX64rr32_]], implicit-def dead $eflags
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1)
- ; CHECK-NEXT: MOV64mr %stack.12, 1, $noreg, 0, $noreg, [[MOV64rm]] :: (store (s64) into %stack.12)
- ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.11, 1, $noreg, 0, $noreg :: (load (s32) from %stack.11)
+ ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
+ ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm2]], [[MOVSX64rr32_]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm2]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1)
+ ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm2]] :: (store (s64) into %stack.13)
+ ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12)
 ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm5]]
- ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
- ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
+ ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
+ ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
 ; CHECK-NEXT: JMP_1 %bb.6
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: bb.4.for.cond.cleanup:
@@ -108,12 +111,12 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
 ; CHECK-NEXT: bb.5.for.cond.cleanup16:
 ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000)
 ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
- ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm3]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7)
- ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10)
- ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm4]], implicit-def dead $eflags :: (store (s64) into %stack.9)
- ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOV64rm3]] :: (store (s64) into %stack.6)
- ; CHECK-NEXT: CMP64rm [[MOV64rm3]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8)
+ ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
+ ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm5]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7)
+ ; CHECK-NEXT: [[MOV64rm6:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10)
+ ; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm6]], implicit-def dead $eflags :: (store (s64) into %stack.9)
+ ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOV64rm5]] :: (store (s64) into %stack.6)
+ ; CHECK-NEXT: CMP64rm [[MOV64rm5]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8)
 ; CHECK-NEXT: JCC_1 %bb.2, 12, implicit $eflags
 ; CHECK-NEXT: JMP_1 %bb.4
 ; CHECK-NEXT: {{ $}}
@@ -121,11 +124,11 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
 ; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000)
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit
- ; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load (s64) from %stack.12)
- ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm5]], 1, [[MOVSX64rr32_]], 0, $noreg
+ ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13)
+ ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg
 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit
 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]]
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[LEA64r1]]
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]]
 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]]
 ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY6]]
 ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
@@ -134,9 +137,10 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
 ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64 = COPY [[MOV32rm2]]
 ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64 = COPY [[COPY1]]
 ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY18]], 1, [[COPY9]], 0, $noreg
+ ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]]
 ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
- ; CHECK-NEXT: [[MOV64rm6:%[0-9]+]]:gr64_nosp = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
- ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm6]], 0, $noreg
+ ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[COPY19]], 0, $noreg
+ ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY19]]
 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY18]]
 ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY17]]
 ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY16]]
@@ -144,15 +148,15 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
 ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY14]]
 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[COPY13]]
 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY12]]
- ; CHECK-NEXT: [[LEA64r1:%[0-9]+]]:gr64 = COPY [[COPY11]]
- ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+ ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = COPY [[COPY11]]
+ ; CHECK-NEXT: [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
 ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY10]]
 ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY6]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]]
- ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm1]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]]
- ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm2]], [[MOVSX64rr32_3]], implicit-def dead $eflags
- ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm1]], [[LEA64r1]], implicit-def dead $eflags
+ ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]]
+ ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm4]], [[MOVSX64rr32_3]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def dead $eflags
 ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY9]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags
- ; CHECK-NEXT: CMP64rr [[MOV64rm2]], [[MOV64rm7]], implicit-def $eflags
+ ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOV64rm1]], implicit-def $eflags
 ; CHECK-NEXT: JCC_1 %bb.6, 12, implicit $eflags
 ; CHECK-NEXT: JMP_1 %bb.5
 entry:
diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll
index 8699aaa505142..5969aae43f82e 100644
--- a/llvm/test/CodeGen/X86/abs.ll
+++ b/llvm/test/CodeGen/X86/abs.ll
@@ -339,9 +339,9 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind {
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: subl $8, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl %edx, %ecx
@@ -352,18 +352,18 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind {
 ; X86-NEXT: negl %ecx
 ; X86-NEXT: cmovsl %esi, %ecx
 ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl %edi, %esi
 ; X86-NEXT: negl %esi
-; X86-NEXT: cmovsl %ebx, %esi
-; X86-NEXT: movl %ebp, %ebx
-; X86-NEXT: negl %ebx
-; X86-NEXT: cmovsl %ebp, %ebx
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: negl %ebp
-; X86-NEXT: cmovsl %edi, %ebp
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: cmovsl %edi, %esi
+; X86-NEXT: movl %ebp, %edi
 ; X86-NEXT: negl %edi
-; X86-NEXT: cmovsl %eax, %edi
+; X86-NEXT: cmovsl %ebp, %edi
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: negl %ebp
+; X86-NEXT: cmovsl %ebx, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: negl %ebx
+; X86-NEXT: cmovsl %eax, %ebx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: negl %eax
@@ -375,9 +375,9 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind {
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl %ecx, 28(%edx)
 ; X86-NEXT: movl %eax, 24(%edx)
-; X86-NEXT: movl %edi, 20(%edx)
+; X86-NEXT: movl %ebx, 20(%edx)
 ; X86-NEXT: movl %ebp, 16(%edx)
-; X86-NEXT: movl %ebx, 12(%edx)
+; X86-NEXT: movl %edi, 12(%edx)
 ; X86-NEXT: movl %esi, 8(%edx)
 ; X86-NEXT: movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT: movl %eax, 4(%edx)
@@ -415,9 +415,9 @@ define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind {
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl %edx, %ecx
@@ -428,18 +428,18 @@ define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind {
 ; X86-NEXT: negw %cx
 ; X86-NEXT: cmovsw %si, %cx
 ; X86-NEXT: movw %cx, (%esp) # 2-byte Spill
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl %edi, %esi
 ; X86-NEXT: negw %si
-; X86-NEXT: cmovsw %bx, %si
-; X86-NEXT: movl %ebp, %ebx
-; X86-NEXT: negw %bx
-; X86-NEXT: cmovsw %bp, %bx
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: negw %bp
-; X86-NEXT: cmovsw %di, %bp
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: cmovsw %di, %si
+; X86-NEXT: movl %ebp, %edi
 ; X86-NEXT: negw %di
-; X86-NEXT: cmovsw %ax, %di
+; X86-NEXT: cmovsw %bp, %di
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: negw %bp
+; X86-NEXT: cmovsw %bx, %bp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: negw %bx
+; X86-NEXT: cmovsw %ax, %bx
 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: negw %ax
@@ -451,9 +451,9 @@ define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind {
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movw %cx, 14(%edx)
 ; X86-NEXT: movw %ax, 12(%edx)
-; X86-NEXT: movw %di, 10(%edx)
+; X86-NEXT: movw %bx, 10(%edx)
 ; X86-NEXT: movw %bp, 8(%edx)
-; X86-NEXT: movw %bx, 6(%edx)
+; X86-NEXT: movw %di, 6(%edx)
 ; X86-NEXT: movw %si, 4(%edx)
 ; X86-NEXT: movzwl (%esp), %eax # 2-byte Folded Reload
 ; X86-NEXT: movw %ax, 2(%edx)
@@ -486,7 +486,6 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
 ; X86-LABEL: test_v16i8:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
 ; X86-NEXT: movb {{[0-9]+}}(%esp), %bl
@@ -542,6 +541,12 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
 ; X86-NEXT: xorb %al, %cl
 ; X86-NEXT: subb %al, %cl
 ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarb $7, %al
+; X86-NEXT: xorb %al, %cl
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh
 ; X86-NEXT: movb %bh, %al
 ; X86-NEXT: sarb $7, %al
@@ -572,40 +577,34 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
 ; X86-NEXT: sarb $7, %al
 ; X86-NEXT: xorb %al, %cl
 ; X86-NEXT: subb %al, %cl
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movb %al, %ah
-; X86-NEXT: sarb $7, %ah
-; X86-NEXT: xorb %ah, %al
-; X86-NEXT: subb %ah, %al
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movb %al, 15(%esi)
-; X86-NEXT: movb %cl, 14(%esi)
-; X86-NEXT: movb %dl, 13(%esi)
-; X86-NEXT: movb %ch, 12(%esi)
-; X86-NEXT: movb %dh, 11(%esi)
-; X86-NEXT: movb %bl, 10(%esi)
-; X86-NEXT: movb %bh, 9(%esi)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: movb %al, 8(%esi)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: movb %al, 7(%esi)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: movb %al, 6(%esi)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: movb %al, 5(%esi)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: movb %al, 4(%esi)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: movb %al, 3(%esi)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: movb %al, 2(%esi)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: movb %al, 1(%esi)
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: movb %al, (%esi)
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb %cl, 15(%eax)
+; X86-NEXT: movb %dl, 14(%eax)
+; X86-NEXT: movb %ch, 13(%eax)
+; X86-NEXT: movb %dh, 12(%eax)
+; X86-NEXT: movb %bl, 11(%eax)
+; X86-NEXT: movb %bh, 10(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 9(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 8(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 7(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 6(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 5(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 4(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 3(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 2(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, 1(%eax)
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: movb %cl, (%eax)
 ; X86-NEXT: addl $12, %esp
-; X86-NEXT: popl %esi
 ; X86-NEXT: popl %ebx
 ; X86-NEXT: retl $4
  %r = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 false)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 64e2afc1753cc..6d5f8a78cb1d7 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -5095,16 +5095,16 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
 ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX: # %bb.0:
 ; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq 16(%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: movq %rax, %r8
-; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: movq %rax, %r10
-; AVX-NEXT: movl %eax, %r11d
-; AVX-NEXT: movl %eax, %ebx
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: shrl $8, %eax
-; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX-NEXT: movq 16(%rdi), %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: movq %rcx, %r8
+; AVX-NEXT: movq %rcx, %r9
+; AVX-NEXT: movq %rcx, %r10
+; AVX-NEXT: movl %ecx, %r11d
+; AVX-NEXT: movl %ecx, %ebx
+; AVX-NEXT: vmovd %ecx, %xmm0
+; AVX-NEXT: shrl $8, %ecx
+; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
 ; AVX-NEXT: shrl $16, %ebx
 ; AVX-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
 ; AVX-NEXT: shrl $24, %r11d
@@ -5115,74 +5115,74 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
 ; AVX-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
 ; AVX-NEXT: shrq $48, %r8
 ; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
-; AVX-NEXT: movq 24(%rdi), %rax
-; AVX-NEXT: shrq $56, %rcx
-; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $8, %ecx
-; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $24, %ecx
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $40, %rcx
-; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $48, %rcx
-; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq (%rdi), %rcx
+; AVX-NEXT: movq 24(%rdi), %rcx
 ; AVX-NEXT: shrq $56, %rax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
 ; AVX-NEXT: movl %ecx, %eax
 ; AVX-NEXT: shrl $8, %eax
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
 ; AVX-NEXT: movl %ecx, %eax
 ; AVX-NEXT: shrl $16, %eax
-; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
 ; AVX-NEXT: movl %ecx, %eax
 ; AVX-NEXT: shrl $24, %eax
-; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
 ; AVX-NEXT: movq %rcx, %rax
 ; AVX-NEXT: shrq $32, %rax
-; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
 ; AVX-NEXT: movq %rcx, %rax
 ; AVX-NEXT: shrq $40, %rax
-; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
 ; AVX-NEXT: movq %rcx, %rax
 ; AVX-NEXT: shrq $48, %rax
-; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: movq 8(%rdi), %rax
+; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX-NEXT: movq (%rdi), %rax
 ; AVX-NEXT: shrq $56, %rcx
-; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
 ; AVX-NEXT: movl %eax, %ecx
 ; AVX-NEXT: shrl $8, %ecx
-; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
 ; AVX-NEXT: movl %eax, %ecx
 ; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
 ; AVX-NEXT: movl %eax, %ecx
 ; AVX-NEXT: shrl $24, %ecx
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
 ; AVX-NEXT: movq %rax, %rcx
 ; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
 ; AVX-NEXT: movq %rax, %rcx
 ; AVX-NEXT: shrq $40, %rcx
-; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
 ; AVX-NEXT: movq %rax, %rcx
 ; AVX-NEXT: shrq $48, %rcx
-; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
+; AVX-NEXT: movq 8(%rdi), %rcx
 ; AVX-NEXT: shrq $56, %rax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: shrl $8, %eax
+; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: shrl $16, %eax
+; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: shrl $24, %eax
+; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: shrq $32, %rax
+; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: shrq $40, %rax
+; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: shrq $48, %rax
+; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX-NEXT: shrq $56, %rcx
+; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
 ; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm2
 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3
 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
@@ -5197,16 +5197,16 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
 ; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 16(%rdi), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq %rax, %r9
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movl %eax, %r11d
-; AVX2-NEXT: movl %eax, %ebx
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq 16(%rdi), %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: movq %rcx, %r8
+; AVX2-NEXT: movq %rcx, %r9
+; AVX2-NEXT: movq %rcx, %r10
+; AVX2-NEXT: movl %ecx, %r11d
+; AVX2-NEXT: movl %ecx, %ebx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: shrl $8, %ecx
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
 ; AVX2-NEXT: shrl $16, %ebx
 ; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
 ; AVX2-NEXT: shrl $24, %r11d
@@ -5217,74 +5217,74 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
 ; AVX2-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
 ; AVX2-NEXT: shrq $48, %r8
 ; AVX2-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: movq 24(%rdi), %rax
-; AVX2-NEXT: shrq $56, %rcx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $8, %ecx
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $24, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $40, %rcx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq (%rdi), %rcx
+; AVX2-NEXT: movq 24(%rdi), %rcx
 ; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
 ; AVX2-NEXT: movl %ecx, %eax
 ; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
 ; AVX2-NEXT: movl %ecx, %eax
 ; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
 ; AVX2-NEXT: movl %ecx, %eax
 ; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
 ; AVX2-NEXT: movq %rcx, %rax
 ; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
 ; AVX2-NEXT: movq %rcx, %rax
 ; AVX2-NEXT: shrq $40, %rax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
 ; AVX2-NEXT: movq %rcx, %rax
 ; AVX2-NEXT: shrq $48, %rax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq 8(%rdi), %rax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movq (%rdi), %rax
 ; AVX2-NEXT: shrq $56, %rcx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
 ; AVX2-NEXT: movl %eax, %ecx
 ; AVX2-NEXT: shrl $8, %ecx
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
 ; AVX2-NEXT: movl %eax, %ecx
 ; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
 ; AVX2-NEXT: movl %eax, %ecx
 ; AVX2-NEXT: shrl $24, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
 ; AVX2-NEXT: movq %rax, %rcx
 ; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
 ; AVX2-NEXT: movq %rax, %rcx
 ; AVX2-NEXT: shrq $40, %rcx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
 ; AVX2-NEXT: movq %rax, %rcx
 ; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: movq 8(%rdi), %rcx
 ; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: shrl $8, %eax
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: shrl $24, %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: shrq $32, %rax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: shrq $40, %rax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: shrq $48, %rax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shrq $56, %rcx
+; AVX2-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index d42b994357447..3e7d1138132c4 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1746,7 +1746,7 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
@@ -1758,56 +1758,56 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT: addq %rax, %rcx
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: addq %rbp, %rax
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%r13,%rbp), %r13
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%r12,%rbp), %r12
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%r15,%rbp), %r15
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%r14,%rbp), %r14
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%rbx,%rbp), %rbx
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%r11,%rbp), %r11
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%r10,%rbp), %r10
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%r9,%rbp), %r9
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%r8,%rbp), %r8
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%rdi,%rbp), %rdi
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%rsi,%rbp), %rsi
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSE2-NEXT: leaq -1(%rdx,%rbp), %rdx
-; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: addq %rdx, %rbp
 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx
-; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx
-; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: xorl %ebp, %ebp
-; SSE2-NEXT: addq $-1, %rcx
-; SSE2-NEXT: movl $0, %edx
-; SSE2-NEXT: adcq $-1, %rdx
-; SSE2-NEXT: addq $-1, %rax
-; SSE2-NEXT: adcq $-1, %rbp
-; SSE2-NEXT: shldq $63, %rax, %rbp
-; SSE2-NEXT: shldq $63, %rcx, %rdx
-; SSE2-NEXT: movq %rdx, %xmm1
-; SSE2-NEXT: movq %rbp, %xmm0
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%r13,%rcx), %r13
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%r12,%rcx), %r12
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%r15,%rcx), %r15
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%r14,%rcx), %r14
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%rbx,%rcx), %rbx
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%r11,%rcx), %r11
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%r10,%rcx), %r10
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%r9,%rcx), %r9
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%r8,%rcx), %r8
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%rdi,%rcx), %rdi
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%rsi,%rcx), %rsi
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE2-NEXT: leaq -1(%rax,%rcx), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: leaq -1(%rcx,%rax), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: leaq -1(%rcx,%rax), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: xorl %ecx, %ecx
+; SSE2-NEXT: addq $-1, %rbp
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: adcq $-1, %rax
+; SSE2-NEXT: addq $-1, %rdx
+; SSE2-NEXT: adcq $-1, %rcx
+; SSE2-NEXT: shldq $63, %rdx, %rcx
+; SSE2-NEXT: shldq $63, %rbp, %rax
+; SSE2-NEXT: movq %rax, %xmm1
+; SSE2-NEXT: movq %rcx, %xmm0
 ; SSE2-NEXT: shrq %r13
 ; SSE2-NEXT: movq %r13, %xmm3
 ; SSE2-NEXT: shrq %r12
@@ -1825,14 +1825,14 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT: shrq %r9
 ; SSE2-NEXT: movq %r9, %xmm8
 ; SSE2-NEXT: shrq %r8
-; SSE2-NEXT: movq %r8, %xmm10
+; SSE2-NEXT: movq %r8, %xmm11
 ; SSE2-NEXT: shrq %rdi
-; SSE2-NEXT: movq %rdi, %xmm11
+; SSE2-NEXT: movq %rdi, %xmm12
 ; SSE2-NEXT: shrq %rsi
-; SSE2-NEXT: movq %rsi, %xmm12
+; SSE2-NEXT: movq %rsi, %xmm13
 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; SSE2-NEXT: shrq %rax
-; SSE2-NEXT: movq %rax, %xmm13
+; SSE2-NEXT: movq %rax, %xmm10
 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; SSE2-NEXT: shrq %rax
 ; SSE2-NEXT: movq %rax, %xmm14
@@ -1857,18 +1857,18 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
 ; SSE2-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0]
 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT: por %xmm8, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; SSE2-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
+; SSE2-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1]
 ; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm13, %xmm2
+; SSE2-NEXT: pandn %xmm10, %xmm2
 ; SSE2-NEXT: por %xmm3, %xmm2
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -1898,67 +1898,67 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT: vpextrw $5, %xmm0, %eax
 ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT: vpextrw $6, %xmm0, %r10d
-; AVX1-NEXT: vpextrw $7, %xmm0, %edx
+; AVX1-NEXT: vpextrw $6, %xmm0, %ebx
+; AVX1-NEXT: vpextrw $7, %xmm0, %esi
 ; AVX1-NEXT: vpextrw $0, %xmm3, %edi
 ; AVX1-NEXT: vpextrw $1, %xmm3, %r8d
 ; AVX1-NEXT: vpextrw $2, %xmm3, %r9d
-; AVX1-NEXT: vpextrw $3, %xmm3, %r11d
-; AVX1-NEXT: vpextrw $4, %xmm3, %ebx
+; AVX1-NEXT: vpextrw $3, %xmm3, %r10d
+; AVX1-NEXT: vpextrw $4, %xmm3, %r11d
 ; AVX1-NEXT: vpextrw $5, %xmm3, %r14d
 ; AVX1-NEXT: vpextrw $6, %xmm3, %r15d
-; AVX1-NEXT: vpextrw $7, %xmm3, %esi
-; AVX1-NEXT: vpextrw $1, %xmm0, %r13d
+; AVX1-NEXT: vpextrw $7, %xmm3, %edx
+; AVX1-NEXT: vpextrw $1, %xmm0, %eax
 ; AVX1-NEXT: vpextrw $0, %xmm0, %r12d
 ; AVX1-NEXT: vpextrw $1, %xmm1, %ecx
-; AVX1-NEXT: addq %r13, %rcx
+; AVX1-NEXT: addq %rax, %rcx
 ; AVX1-NEXT: vpextrw $0, %xmm1, %eax
 ; AVX1-NEXT: addq %r12, %rax
 ; AVX1-NEXT: vpextrw $7, %xmm2, %r12d
-; AVX1-NEXT: leaq -1(%rsi,%r12), %rsi
+; AVX1-NEXT: leaq -1(%rdx,%r12), %rdx
 ; AVX1-NEXT: vpextrw $6, %xmm2, %r12d
 ; AVX1-NEXT: leaq -1(%r15,%r12), %rbp
 ; AVX1-NEXT: vpextrw $5, %xmm2, %r15d
 ; AVX1-NEXT: leaq -1(%r14,%r15), %r13
 ; AVX1-NEXT: vpextrw $4, %xmm2, %r14d
-; AVX1-NEXT: leaq -1(%rbx,%r14), %r12
-; AVX1-NEXT: vpextrw $3, %xmm2, %ebx
-; AVX1-NEXT: leaq -1(%r11,%rbx), %r15
-; AVX1-NEXT: vpextrw $2, %xmm2, %r11d
-; AVX1-NEXT: leaq -1(%r9,%r11), %r14
+; AVX1-NEXT: leaq -1(%r11,%r14), %r12
+; AVX1-NEXT: vpextrw $3, %xmm2, %r11d
+; AVX1-NEXT: leaq -1(%r10,%r11), %r15
+; AVX1-NEXT: vpextrw $2, %xmm2, %r10d
+; AVX1-NEXT: leaq -1(%r9,%r10), %r14
 ; AVX1-NEXT: vpextrw $1, %xmm2, %r9d
-; AVX1-NEXT: leaq -1(%r8,%r9), %rbx
+; AVX1-NEXT: leaq -1(%r8,%r9), %r11
 ; AVX1-NEXT: vpextrw $0, %xmm2, %r8d
-; AVX1-NEXT: leaq -1(%rdi,%r8), %r11
+; AVX1-NEXT: leaq -1(%rdi,%r8), %r10
 ; AVX1-NEXT: vpextrw $7, %xmm1, %edi
-; AVX1-NEXT: leaq -1(%rdx,%rdi), %r9
-; AVX1-NEXT: vpextrw $6, %xmm1, %edx
-; AVX1-NEXT: leaq -1(%r10,%rdx), %r8
-; AVX1-NEXT: vpextrw $5, %xmm1, %edx
+; AVX1-NEXT: leaq -1(%rsi,%rdi), %r9
+; AVX1-NEXT: vpextrw $6, %xmm1, %esi
+; AVX1-NEXT: leaq -1(%rbx,%rsi), %r8
+; AVX1-NEXT: vpextrw $5, %xmm1, %esi
 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX1-NEXT: leaq -1(%rdi,%rdx), %rdi
-; AVX1-NEXT: vpextrw $4, %xmm1, %edx
-; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX1-NEXT: leaq -1(%r10,%rdx), %rdx
-; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT: vpextrw $3, %xmm0, %edx
-; AVX1-NEXT: vpextrw $3, %xmm1, %r10d
-; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx
-; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT: vpextrw $2, %xmm0, %edx
-; AVX1-NEXT: vpextrw $2, %xmm1, %r10d
-; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx
-; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: leaq -1(%rdi,%rsi), %rsi
+; AVX1-NEXT: vpextrw $4, %xmm1, %edi
+; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT: leaq -1(%rbx,%rdi), %rdi
+; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: vpextrw $3, %xmm0, %edi
+; AVX1-NEXT: vpextrw $3, %xmm1, %ebx
+; AVX1-NEXT: leaq -1(%rdi,%rbx), %rdi
+; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: vpextrw $2, %xmm0, %edi
+; AVX1-NEXT: vpextrw $2, %xmm1, %ebx
+; AVX1-NEXT: leaq -1(%rdi,%rbx), %rdi
+; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT: xorl %edi, %edi
 ; AVX1-NEXT: addq $-1, %rcx
-; AVX1-NEXT: movl $0, %r10d
-; AVX1-NEXT: adcq $-1, %r10
+; AVX1-NEXT: movl $0, %ebx
+; AVX1-NEXT: adcq $-1, %rbx
 ; AVX1-NEXT: addq $-1, %rax
-; AVX1-NEXT: adcq $-1, %rdx
-; AVX1-NEXT: shldq $63, %rax, %rdx
-; AVX1-NEXT: shldq $63, %rcx, %r10
-; AVX1-NEXT: shrq %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm0
+; AVX1-NEXT: adcq $-1, %rdi
+; AVX1-NEXT: shldq $63, %rax, %rdi
+; AVX1-NEXT: shldq $63, %rcx, %rbx
+; AVX1-NEXT: shrq %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm0
 ; AVX1-NEXT: shrq %rbp
 ; AVX1-NEXT: vmovq %rbp, %xmm1
 ; AVX1-NEXT: shrq %r13
@@ -1969,21 +1969,21 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; AVX1-NEXT: vmovq %r15, %xmm4
 ; AVX1-NEXT: shrq %r14
 ; AVX1-NEXT: vmovq %r14, %xmm5
-; AVX1-NEXT: shrq %rbx
-; AVX1-NEXT: vmovq %rbx, %xmm6
 ; AVX1-NEXT: shrq %r11
-; AVX1-NEXT: vmovq %r11, %xmm7
+; AVX1-NEXT: vmovq %r11, %xmm6
+; AVX1-NEXT: shrq %r10
+; AVX1-NEXT: vmovq %r10, %xmm7
 ; AVX1-NEXT: shrq %r9
 ; AVX1-NEXT: vmovq %r9, %xmm8
 ; AVX1-NEXT: shrq %r8
 ; AVX1-NEXT: vmovq %r8, %xmm9
-; AVX1-NEXT: shrq %rdi
-; AVX1-NEXT: vmovq %rdi, %xmm10
+; AVX1-NEXT: shrq %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm10
 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT: shrq %rax
 ; AVX1-NEXT: vmovq %rax, %xmm11
-; AVX1-NEXT: vmovq %r10, %xmm12
-; AVX1-NEXT: vmovq %rdx, %xmm13
+; AVX1-NEXT: vmovq %rbx, %xmm12
+; AVX1-NEXT: vmovq %rdi, %xmm13
 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT: shrq %rax
 ; AVX1-NEXT: vmovq %rax, %xmm14
@@ -2030,143 +2030,143 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; AVX2-NEXT: pushq %r12
 ; AVX2-NEXT: pushq %rbx
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX2-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vmovq %xmm7, %r13
+; AVX2-NEXT: vmovq %xmm7, %rsi
 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT: vmovq %xmm2, %rbp
+; AVX2-NEXT: vmovq %xmm2, %rdx
 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8
 ; AVX2-NEXT: vmovq %xmm8, %r8
-; AVX2-NEXT: vpextrq $1, %xmm8, %r15
+; AVX2-NEXT: vpextrq $1, %xmm8, %r13
 ; AVX2-NEXT: vpextrq $1, %xmm2, %r14
-; AVX2-NEXT: vpextrq $1, %xmm7, %rbx
-; AVX2-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX2-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: vpextrq $1, %xmm3, %rax
-; AVX2-NEXT: vmovq %xmm3, %rdi
-; AVX2-NEXT: vpextrq $1, %xmm0, %r10
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX2-NEXT: vpextrq $1, %xmm7, %r15
+; AVX2-NEXT: vpextrq $1, %xmm6, %r12
+; AVX2-NEXT: vpextrq $1, %xmm4, %rbx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
+; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %r11
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2
 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-NEXT: vpextrq $1, %xmm9, %r11
-; AVX2-NEXT: addq %r15, %r11
+; AVX2-NEXT: vpextrq $1, %xmm9, %r9
+; AVX2-NEXT: addq %r13, %r9
+; AVX2-NEXT: movq %r9, %r13
 ; AVX2-NEXT: vpextrq $1, %xmm8, %r9
 ; AVX2-NEXT: addq %r14, %r9
 ; AVX2-NEXT: movq %r9, %r14
-; AVX2-NEXT: vpextrq $1, %xmm7, %r9
-; AVX2-NEXT: addq %rbx, %r9
-; AVX2-NEXT: movq %r9, %rbx
-; AVX2-NEXT: vpextrq $1, %xmm4, %r15
-; AVX2-NEXT: addq %rsi, %r15
-; AVX2-NEXT: vpextrq $1, %xmm5, %r12
-; AVX2-NEXT: addq %rdx, %r12
-; AVX2-NEXT: vpextrq $1, %xmm3, %r9
+; AVX2-NEXT: vpextrq $1, %xmm7, %r10
+; AVX2-NEXT: addq %r15, %r10
+; AVX2-NEXT: vpextrq $1, %xmm5, %r15
+; AVX2-NEXT: addq %r12, %r15
+; AVX2-NEXT: vpextrq $1, %xmm4, %r12
+; AVX2-NEXT: addq %rbx, %r12
+; AVX2-NEXT: vpextrq $1, %xmm3, %rbp
+; AVX2-NEXT: addq %rdi, %rbp
+; AVX2-NEXT: vpextrq $1, %xmm6, %r9
 ; AVX2-NEXT: addq %rcx, %r9
-; AVX2-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX2-NEXT: addq %rax, %rsi
-; AVX2-NEXT: vmovq %xmm6, %rdx
-; AVX2-NEXT: addq %rdi, %rdx
+; AVX2-NEXT: vmovq %xmm6, %rdi
+; AVX2-NEXT: addq %rax, %rdi
 ; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX2-NEXT: addq %r10, %rcx
-; AVX2-NEXT: vmovq %xmm9, %r10
-; AVX2-NEXT: leaq -1(%r8,%r10), %rax
+; AVX2-NEXT: addq %r11, %rcx
+; AVX2-NEXT: vmovq %xmm9, %r11
+; AVX2-NEXT: leaq -1(%r8,%r11), %rax
 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vmovq %xmm8, %rdi
-; AVX2-NEXT: leaq -1(%rbp,%rdi), %rax
+; AVX2-NEXT: vmovq %xmm8, %r8
+; AVX2-NEXT: leaq -1(%rdx,%r8), %rax
 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vmovq %xmm7, %rdi
-; AVX2-NEXT: leaq -1(%r13,%rdi), %rax
+; AVX2-NEXT: vmovq %xmm7, %rdx
+; AVX2-NEXT: leaq -1(%rsi,%rdx), %rax
 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vmovq %xmm4, %rdi
+; AVX2-NEXT: vmovq %xmm5, %rdx
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: leaq -1(%rax,%rdi), %rax
+; AVX2-NEXT: leaq -1(%rax,%rdx), %rax
 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vmovq %xmm5, %rdi
+; AVX2-NEXT: vmovq %xmm4, %rdx
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: leaq -1(%rax,%rdi), %rax
+; AVX2-NEXT: leaq -1(%rax,%rdx), %rax
 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vmovq %xmm1, %rdi
-; AVX2-NEXT: vmovq %xmm3, %r8
-; AVX2-NEXT: leaq -1(%rdi,%r8), %rax
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: vmovq %xmm3, %rsi
+; AVX2-NEXT: leaq -1(%rdx,%rsi), %rax
 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vmovq %xmm0, %rdi
-; AVX2-NEXT: vmovq %xmm2, %r8
-; AVX2-NEXT: leaq -1(%rdi,%r8), %rdi
-; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: addq $-1, %r11
-; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl $0, %r8d
-; AVX2-NEXT: adcq $-1, %r8
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vmovq %xmm2, %rsi
+; AVX2-NEXT: leaq -1(%rdx,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addq $-1, %r13
+; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %edx
+; AVX2-NEXT: adcq $-1, %rdx
 ; AVX2-NEXT: addq $-1, %r14
 ; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl $0, %edi
-; AVX2-NEXT: adcq $-1, %rdi
-; AVX2-NEXT: addq $-1, %rbx
-; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl $0, %r11d
-; AVX2-NEXT: adcq $-1, %r11
+; AVX2-NEXT: movl $0, %esi
+; AVX2-NEXT: adcq $-1, %rsi
+; AVX2-NEXT: addq $-1, %r10
+; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $0, %r8d
+; AVX2-NEXT: adcq $-1, %r8
 ; AVX2-NEXT: addq $-1, %r15
 ; AVX2-NEXT: movl $0, %r10d
 ; AVX2-NEXT: adcq $-1, %r10
 ; AVX2-NEXT: addq $-1, %r12
+; AVX2-NEXT: movl $0, %ebx
+; AVX2-NEXT: adcq $-1, %rbx
+; AVX2-NEXT: addq $-1, %rbp
 ; AVX2-NEXT: movl $0, %r14d
 ; AVX2-NEXT: adcq $-1, %r14
 ; AVX2-NEXT: addq $-1, %r9
-; AVX2-NEXT: movl $0, %ebp
-; AVX2-NEXT: adcq $-1, %rbp
-; AVX2-NEXT: addq $-1, %rsi
 ; AVX2-NEXT: movl $0, %r13d
 ; AVX2-NEXT: adcq $-1, %r13
-; AVX2-NEXT: addq $-1, %rdx
-; AVX2-NEXT: movl $0, %ebx
-; AVX2-NEXT: adcq $-1, %rbx
+; AVX2-NEXT: addq $-1, %rdi
+; AVX2-NEXT: movl $0, %r11d
+; AVX2-NEXT: adcq $-1, %r11
 ; AVX2-NEXT: addq $-1, %rcx
 ; AVX2-NEXT: movl $0, %eax
 ; AVX2-NEXT: adcq $-1, %rax
 ; AVX2-NEXT: shldq $63, %rcx, %rax
-; AVX2-NEXT: shldq $63, %rdx, %rbx
-; AVX2-NEXT: shldq $63, %rsi, %r13
-; AVX2-NEXT: shldq $63, %r9, %rbp
-; AVX2-NEXT: shldq $63, %r12, %r14
+; AVX2-NEXT: shldq $63, %rdi, %r11
+; AVX2-NEXT: shldq $63, %r9, %r13
+; AVX2-NEXT: shldq $63, %rbp, %r14
+; AVX2-NEXT: shldq $63, %r12, %rbx
 ; AVX2-NEXT: shldq $63, %r15, %r10
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: shldq $63, %rcx, %r11
+; AVX2-NEXT: shldq $63, %rcx, %r8
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: shldq $63, %rcx, %rdi
+; AVX2-NEXT: shldq $63, %rcx, %rsi
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: shldq $63, %rcx, %r8
-; AVX2-NEXT: vmovq %r8, %xmm0
+; AVX2-NEXT: shldq $63, %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm0
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX2-NEXT: shrq %rcx
 ; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vmovq %rdi, %xmm2
+; AVX2-NEXT: vmovq %rsi, 
%xmm2 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm3 -; AVX2-NEXT: vmovq %r11, %xmm4 +; AVX2-NEXT: vmovq %r8, %xmm4 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm5 @@ -2174,16 +2174,16 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm7 -; AVX2-NEXT: vmovq %r14, %xmm8 +; AVX2-NEXT: vmovq %rbx, %xmm8 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm9 -; AVX2-NEXT: vmovq %rbp, %xmm10 +; AVX2-NEXT: vmovq %r14, %xmm10 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: vmovq %rcx, %xmm11 ; AVX2-NEXT: vmovq %r13, %xmm12 -; AVX2-NEXT: vmovq %rbx, %xmm13 +; AVX2-NEXT: vmovq %r11, %xmm13 ; AVX2-NEXT: vmovq %rax, %xmm14 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX2-NEXT: shrq %rax @@ -2228,118 +2228,117 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm4, %rbp -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-NEXT: vmovq %xmm4, %rdi -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vmovq %xmm5, %r8 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vmovq %xmm3, %r9 -; AVX512-NEXT: vpextrq $1, %xmm3, %r10 +; 
AVX512-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vmovq %xmm3, %r11 -; AVX512-NEXT: vpextrq $1, %xmm3, %rbx -; AVX512-NEXT: vpextrq $1, %xmm5, %rax -; AVX512-NEXT: vpextrq $1, %xmm4, %r12 -; AVX512-NEXT: vpextrq $1, %xmm1, %r15 -; AVX512-NEXT: vpextrq $1, %xmm0, %r14 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512-NEXT: vmovq %xmm3, %r13 +; AVX512-NEXT: vpextrq $1, %xmm3, %rsi +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-NEXT: vmovq %xmm3, %rdi +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512-NEXT: vmovq %xmm5, %r8 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vmovq %xmm2, %r9 +; AVX512-NEXT: vpextrq $1, %xmm2, %r10 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, %r11 +; AVX512-NEXT: vpextrq $1, %xmm2, %rbx +; AVX512-NEXT: vpextrq $1, %xmm5, %rdx +; AVX512-NEXT: vpextrq $1, %xmm3, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %r14 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm8, %rsi -; AVX512-NEXT: addq %rax, %rsi -; AVX512-NEXT: vpextrq $1, %xmm7, %rdx -; AVX512-NEXT: addq %r12, %rdx -; AVX512-NEXT: vpextrq $1, %xmm4, %rcx -; AVX512-NEXT: addq %r15, %rcx -; AVX512-NEXT: vpextrq $1, %xmm3, %rax +; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512-NEXT: vpextrq $1, %xmm8, %rbp +; AVX512-NEXT: addq %rdx, %rbp +; AVX512-NEXT: vpextrq $1, %xmm4, %rdx +; AVX512-NEXT: addq %rcx, %rdx +; AVX512-NEXT: vpextrq $1, %xmm3, %rcx +; AVX512-NEXT: addq %rax, %rcx +; AVX512-NEXT: vpextrq $1, %xmm2, 
%rax ; AVX512-NEXT: addq %r14, %rax ; AVX512-NEXT: vpextrq $1, %xmm9, %r14 -; AVX512-NEXT: leaq -1(%rbx,%r14), %r13 +; AVX512-NEXT: leaq -1(%rbx,%r14), %r12 ; AVX512-NEXT: vmovq %xmm9, %rbx -; AVX512-NEXT: leaq -1(%r11,%rbx), %r12 -; AVX512-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512-NEXT: leaq -1(%r10,%r11), %r15 -; AVX512-NEXT: vmovq %xmm2, %r10 -; AVX512-NEXT: leaq -1(%r9,%r10), %r14 +; AVX512-NEXT: leaq -1(%r11,%rbx), %r15 +; AVX512-NEXT: vpextrq $1, %xmm7, %r11 +; AVX512-NEXT: leaq -1(%r10,%r11), %r14 +; AVX512-NEXT: vmovq %xmm7, %r10 +; AVX512-NEXT: leaq -1(%r9,%r10), %rbx ; AVX512-NEXT: vmovq %xmm8, %r9 ; AVX512-NEXT: leaq -1(%r8,%r9), %r11 -; AVX512-NEXT: vmovq %xmm7, %r8 +; AVX512-NEXT: vmovq %xmm4, %r8 ; AVX512-NEXT: leaq -1(%rdi,%r8), %r10 ; AVX512-NEXT: vpextrq $1, %xmm6, %rdi -; AVX512-NEXT: leaq -1(%rbp,%rdi), %r9 -; AVX512-NEXT: vmovq %xmm6, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vpextrq $1, %xmm5, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm5, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm1, %rdi -; AVX512-NEXT: vmovq %xmm4, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm0, %rdi -; AVX512-NEXT: vmovq %xmm3, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: leaq -1(%rsi,%rdi), %r9 +; AVX512-NEXT: vmovq %xmm6, %rsi +; AVX512-NEXT: leaq -1(%r13,%rsi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm5, %rsi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX512-NEXT: leaq -1(%rdi,%rsi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm5, %rsi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; AVX512-NEXT: leaq -1(%rdi,%rsi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm1, %rsi +; AVX512-NEXT: vmovq %xmm3, %rdi +; AVX512-NEXT: leaq -1(%rsi,%rdi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm0, %rsi +; AVX512-NEXT: vmovq %xmm2, %rdi +; AVX512-NEXT: leaq -1(%rsi,%rdi), %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: addq $-1, %rsi +; AVX512-NEXT: addq $-1, %rbp +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: adcq $-1, %rsi +; AVX512-NEXT: addq $-1, %rdx ; AVX512-NEXT: movl $0, %edi ; AVX512-NEXT: adcq $-1, %rdi -; AVX512-NEXT: addq $-1, %rdx -; AVX512-NEXT: movl $0, %ebp -; AVX512-NEXT: adcq $-1, %rbp ; AVX512-NEXT: addq $-1, %rcx -; AVX512-NEXT: movl $0, %ebx -; AVX512-NEXT: adcq $-1, %rbx +; AVX512-NEXT: movl $0, %r13d +; AVX512-NEXT: adcq $-1, %r13 ; AVX512-NEXT: addq $-1, %rax ; AVX512-NEXT: adcq $-1, %r8 ; AVX512-NEXT: shldq $63, %rax, %r8 -; AVX512-NEXT: shldq $63, %rcx, %rbx -; AVX512-NEXT: shldq $63, %rdx, %rbp -; AVX512-NEXT: shldq $63, %rsi, %rdi -; AVX512-NEXT: shrq %r13 -; AVX512-NEXT: vmovq %r13, %xmm0 +; AVX512-NEXT: shldq 
$63, %rcx, %r13 +; AVX512-NEXT: shldq $63, %rdx, %rdi +; AVX512-NEXT: shldq $63, %rbp, %rsi ; AVX512-NEXT: shrq %r12 -; AVX512-NEXT: vmovq %r12, %xmm1 +; AVX512-NEXT: vmovq %r12, %xmm0 ; AVX512-NEXT: shrq %r15 -; AVX512-NEXT: vmovq %r15, %xmm2 +; AVX512-NEXT: vmovq %r15, %xmm1 ; AVX512-NEXT: shrq %r14 -; AVX512-NEXT: vmovq %r14, %xmm3 -; AVX512-NEXT: vmovq %rdi, %xmm4 +; AVX512-NEXT: vmovq %r14, %xmm2 +; AVX512-NEXT: shrq %rbx +; AVX512-NEXT: vmovq %rbx, %xmm3 +; AVX512-NEXT: vmovq %rsi, %xmm4 ; AVX512-NEXT: shrq %r11 ; AVX512-NEXT: vmovq %r11, %xmm5 -; AVX512-NEXT: vmovq %rbp, %xmm6 +; AVX512-NEXT: vmovq %rdi, %xmm6 ; AVX512-NEXT: shrq %r10 ; AVX512-NEXT: vmovq %r10, %xmm7 ; AVX512-NEXT: shrq %r9 @@ -2353,7 +2352,7 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: shrq %rax ; AVX512-NEXT: vmovq %rax, %xmm11 -; AVX512-NEXT: vmovq %rbx, %xmm12 +; AVX512-NEXT: vmovq %r13, %xmm12 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: shrq %rax ; AVX512-NEXT: vmovq %rax, %xmm13 diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index 9411aad9a21e4..b39b089faa2a5 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -910,27 +910,27 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: kmovw %r10d, %k2 ; KNL-NEXT: kandw %k1, %k2, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: kshiftrw $1, %k0, %k1 ; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kshiftrw $1, %k0, %k1 +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftrw $3, %k0, %k1 -; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftrw $4, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftrw $5, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftrw $6, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftrw $7, %k0, %k1 -; KNL-NEXT: kmovw %k1, %r11d -; KNL-NEXT: kshiftrw $8, %k0, %k1 ; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: kshiftrw $8, %k0, %k1 +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftrw $9, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftrw $10, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftrw $11, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftrw $12, %k0, %k1 @@ -938,40 +938,40 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: kshiftrw $13, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftrw $14, %k0, %k1 -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: movb %cl, 2(%rax) -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl $1, %ecx ; KNL-NEXT: andl $1, %edx -; KNL-NEXT: leal (%rcx,%rdx,2), %ecx +; KNL-NEXT: movb %dl, 2(%rax) +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: andl $1, %edx +; KNL-NEXT: andl $1, %r9d +; KNL-NEXT: leal (%rdx,%r9,2), %r9d ; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: andl $1, %r8d +; KNL-NEXT: leal (%r9,%r8,4), %r9d +; KNL-NEXT: kmovw %k0, %r8d ; KNL-NEXT: andl $1, %esi -; KNL-NEXT: leal (%rcx,%rsi,4), %ecx -; KNL-NEXT: kmovw %k0, %esi +; KNL-NEXT: leal (%r9,%rsi,8), %esi ; KNL-NEXT: andl $1, %edi -; KNL-NEXT: leal (%rcx,%rdi,8), %ecx -; KNL-NEXT: andl $1, %r9d -; KNL-NEXT: shll $4, %r9d -; KNL-NEXT: orl %ecx, %r9d -; KNL-NEXT: andl $1, %r8d 
-; KNL-NEXT: shll $5, %r8d -; KNL-NEXT: orl %r9d, %r8d +; KNL-NEXT: shll $4, %edi +; KNL-NEXT: orl %esi, %edi +; KNL-NEXT: andl $1, %ecx +; KNL-NEXT: shll $5, %ecx +; KNL-NEXT: orl %edi, %ecx ; KNL-NEXT: andl $1, %r10d ; KNL-NEXT: shll $6, %r10d -; KNL-NEXT: andl $1, %r11d -; KNL-NEXT: shll $7, %r11d -; KNL-NEXT: orl %r10d, %r11d ; KNL-NEXT: andl $1, %ebx -; KNL-NEXT: shll $8, %ebx -; KNL-NEXT: orl %r11d, %ebx +; KNL-NEXT: shll $7, %ebx +; KNL-NEXT: orl %r10d, %ebx +; KNL-NEXT: andl $1, %ebp +; KNL-NEXT: shll $8, %ebp +; KNL-NEXT: orl %ebx, %ebp ; KNL-NEXT: andl $1, %r14d ; KNL-NEXT: shll $9, %r14d -; KNL-NEXT: orl %ebx, %r14d -; KNL-NEXT: andl $1, %ebp -; KNL-NEXT: shll $10, %ebp -; KNL-NEXT: orl %r14d, %ebp -; KNL-NEXT: orl %r8d, %ebp +; KNL-NEXT: orl %ebp, %r14d +; KNL-NEXT: andl $1, %r11d +; KNL-NEXT: shll $10, %r11d +; KNL-NEXT: orl %r14d, %r11d +; KNL-NEXT: orl %ecx, %r11d ; KNL-NEXT: andl $1, %r15d ; KNL-NEXT: shll $11, %r15d ; KNL-NEXT: andl $1, %r12d @@ -983,11 +983,11 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: andl $1, %edx ; KNL-NEXT: shll $14, %edx ; KNL-NEXT: orl %r13d, %edx -; KNL-NEXT: andl $1, %esi -; KNL-NEXT: shll $15, %esi -; KNL-NEXT: orl %edx, %esi -; KNL-NEXT: orl %ebp, %esi -; KNL-NEXT: movw %si, (%rax) +; KNL-NEXT: andl $1, %r8d +; KNL-NEXT: shll $15, %r8d +; KNL-NEXT: orl %edx, %r8d +; KNL-NEXT: orl %r11d, %r8d +; KNL-NEXT: movw %r8w, (%rax) ; KNL-NEXT: popq %rbx ; KNL-NEXT: popq %r12 ; KNL-NEXT: popq %r13 @@ -1085,16 +1085,16 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kshiftrd $21, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: movl $-2049, %edi ## imm = 0xF7FF -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SKX-NEXT: kandd %k1, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k6 +; SKX-NEXT: kandd %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftld $31, %k1, %k1 ; SKX-NEXT: kshiftrd $20, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: movl $-4097, %edi ## imm = 0xEFFF -; SKX-NEXT: kmovd %edi, %k6 -; SKX-NEXT: kandd %k6, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftld $31, %k1, %k1 ; SKX-NEXT: kshiftrd $19, %k1, %k1 @@ -1190,14 +1190,14 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kshiftrd $21, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload -; SKX-NEXT: kandd %k7, %k0, %k0 +; SKX-NEXT: kandd %k6, %k0, %k0 ; SKX-NEXT: kshiftld $31, %k1, %k1 ; SKX-NEXT: kshiftrd $20, %k1, %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 ; SKX-NEXT: kord %k1, %k0, %k0 -; SKX-NEXT: kandd %k6, %k0, %k0 -; SKX-NEXT: kshiftld $31, %k7, %k1 +; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload +; SKX-NEXT: kandd %k1, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k6, %k1 ; SKX-NEXT: kshiftrd $19, %k1, %k1 ; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kandd %k5, %k0, %k0 @@ -1223,27 +1223,27 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload ; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ecx -; SKX-NEXT: kshiftrd $1, %k0, %k1 ; SKX-NEXT: kmovd %k1, %edx +; SKX-NEXT: kshiftrd $1, %k0, %k1 +; SKX-NEXT: kmovd %k1, %r9d ; 
SKX-NEXT: kshiftrd $2, %k0, %k1 -; SKX-NEXT: kmovd %k1, %esi +; SKX-NEXT: kmovd %k1, %r8d ; SKX-NEXT: kshiftrd $3, %k0, %k1 -; SKX-NEXT: kmovd %k1, %edi +; SKX-NEXT: kmovd %k1, %esi ; SKX-NEXT: kshiftrd $4, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r9d +; SKX-NEXT: kmovd %k1, %edi ; SKX-NEXT: kshiftrd $5, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r8d +; SKX-NEXT: kmovd %k1, %ecx ; SKX-NEXT: kshiftrd $6, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r10d ; SKX-NEXT: kshiftrd $7, %k0, %k1 -; SKX-NEXT: kmovd %k1, %r11d -; SKX-NEXT: kshiftrd $8, %k0, %k1 ; SKX-NEXT: kmovd %k1, %ebx +; SKX-NEXT: kshiftrd $8, %k0, %k1 +; SKX-NEXT: kmovd %k1, %ebp ; SKX-NEXT: kshiftrd $9, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r14d ; SKX-NEXT: kshiftrd $10, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ebp +; SKX-NEXT: kmovd %k1, %r11d ; SKX-NEXT: kshiftrd $11, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r15d ; SKX-NEXT: kshiftrd $12, %k0, %k1 @@ -1251,40 +1251,40 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kshiftrd $13, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r13d ; SKX-NEXT: kshiftrd $14, %k0, %k1 -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: movb %cl, 2(%rax) -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: andl $1, %ecx ; SKX-NEXT: andl $1, %edx -; SKX-NEXT: leal (%rcx,%rdx,2), %ecx +; SKX-NEXT: movb %dl, 2(%rax) +; SKX-NEXT: kmovd %k0, %edx +; SKX-NEXT: andl $1, %edx +; SKX-NEXT: andl $1, %r9d +; SKX-NEXT: leal (%rdx,%r9,2), %r9d ; SKX-NEXT: kmovd %k1, %edx ; SKX-NEXT: kshiftrd $15, %k0, %k0 +; SKX-NEXT: andl $1, %r8d +; SKX-NEXT: leal (%r9,%r8,4), %r9d +; SKX-NEXT: kmovd %k0, %r8d ; SKX-NEXT: andl $1, %esi -; SKX-NEXT: leal (%rcx,%rsi,4), %ecx -; SKX-NEXT: kmovd %k0, %esi +; SKX-NEXT: leal (%r9,%rsi,8), %esi ; SKX-NEXT: andl $1, %edi -; SKX-NEXT: leal (%rcx,%rdi,8), %ecx -; SKX-NEXT: andl $1, %r9d -; SKX-NEXT: shll $4, %r9d -; SKX-NEXT: orl %ecx, %r9d -; SKX-NEXT: andl $1, %r8d -; SKX-NEXT: shll $5, %r8d -; SKX-NEXT: orl %r9d, %r8d +; SKX-NEXT: shll $4, %edi +; SKX-NEXT: orl %esi, %edi +; SKX-NEXT: andl $1, %ecx +; SKX-NEXT: shll $5, %ecx +; SKX-NEXT: orl %edi, %ecx ; SKX-NEXT: andl $1, %r10d ; SKX-NEXT: shll $6, %r10d -; SKX-NEXT: andl $1, %r11d -; SKX-NEXT: shll $7, %r11d -; SKX-NEXT: orl %r10d, %r11d ; SKX-NEXT: andl $1, %ebx -; SKX-NEXT: shll $8, %ebx -; SKX-NEXT: orl %r11d, %ebx +; SKX-NEXT: shll $7, %ebx +; SKX-NEXT: orl %r10d, %ebx +; SKX-NEXT: andl $1, %ebp +; SKX-NEXT: shll $8, %ebp +; SKX-NEXT: orl %ebx, %ebp ; SKX-NEXT: andl $1, %r14d ; SKX-NEXT: shll $9, %r14d -; SKX-NEXT: orl %ebx, %r14d -; SKX-NEXT: andl $1, %ebp -; SKX-NEXT: shll $10, %ebp -; SKX-NEXT: orl %r14d, %ebp -; SKX-NEXT: orl %r8d, %ebp +; SKX-NEXT: orl %ebp, %r14d +; SKX-NEXT: andl $1, %r11d +; SKX-NEXT: shll $10, %r11d +; SKX-NEXT: orl %r14d, %r11d +; SKX-NEXT: orl %ecx, %r11d ; SKX-NEXT: andl $1, %r15d ; SKX-NEXT: shll $11, %r15d ; SKX-NEXT: andl $1, %r12d @@ -1296,11 +1296,11 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: andl $1, %edx ; SKX-NEXT: shll $14, %edx ; SKX-NEXT: orl %r13d, %edx -; SKX-NEXT: andl $1, %esi -; SKX-NEXT: shll $15, %esi -; SKX-NEXT: orl %edx, %esi -; SKX-NEXT: orl %ebp, %esi -; SKX-NEXT: movw %si, (%rax) +; SKX-NEXT: andl $1, %r8d +; SKX-NEXT: shll $15, %r8d +; SKX-NEXT: orl %edx, %r8d +; SKX-NEXT: orl %r11d, %r8d +; SKX-NEXT: movw %r8w, (%rax) ; SKX-NEXT: popq %rbx ; SKX-NEXT: popq %r12 ; SKX-NEXT: popq %r13 @@ -1726,16 +1726,16 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kshiftrd $21, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: movl $-2049, %edi ## imm = 
0xF7FF -; FASTISEL-NEXT: kmovd %edi, %k1 -; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; FASTISEL-NEXT: kandd %k1, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k6 +; FASTISEL-NEXT: kandd %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftld $31, %k1, %k1 ; FASTISEL-NEXT: kshiftrd $20, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: movl $-4097, %edi ## imm = 0xEFFF -; FASTISEL-NEXT: kmovd %edi, %k6 -; FASTISEL-NEXT: kandd %k6, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k1 +; FASTISEL-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; FASTISEL-NEXT: kandd %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftld $31, %k1, %k1 ; FASTISEL-NEXT: kshiftrd $19, %k1, %k1 @@ -1831,14 +1831,14 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kshiftrd $21, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 4-byte Reload -; FASTISEL-NEXT: kandd %k7, %k0, %k0 +; FASTISEL-NEXT: kandd %k6, %k0, %k0 ; FASTISEL-NEXT: kshiftld $31, %k1, %k1 ; FASTISEL-NEXT: kshiftrd $20, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 ; FASTISEL-NEXT: kord %k1, %k0, %k0 -; FASTISEL-NEXT: kandd %k6, %k0, %k0 -; FASTISEL-NEXT: kshiftld $31, %k7, %k1 +; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload +; FASTISEL-NEXT: kandd %k1, %k0, %k0 +; FASTISEL-NEXT: kshiftld $31, %k6, %k1 ; FASTISEL-NEXT: kshiftrd $19, %k1, %k1 ; FASTISEL-NEXT: kord %k1, %k0, %k0 ; FASTISEL-NEXT: kandd %k5, %k0, %k0 @@ -1864,27 +1864,27 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload ; FASTISEL-NEXT: kandd %k1, %k0, %k0 ; FASTISEL-NEXT: kshiftrd $16, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %ecx -; FASTISEL-NEXT: kshiftrd $1, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %edx +; FASTISEL-NEXT: kshiftrd $1, %k0, %k1 +; FASTISEL-NEXT: kmovd %k1, %r9d ; FASTISEL-NEXT: kshiftrd $2, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %esi +; FASTISEL-NEXT: kmovd %k1, %r8d ; FASTISEL-NEXT: kshiftrd $3, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %edi +; FASTISEL-NEXT: kmovd %k1, %esi ; FASTISEL-NEXT: kshiftrd $4, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r9d +; FASTISEL-NEXT: kmovd %k1, %edi ; FASTISEL-NEXT: kshiftrd $5, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r8d +; FASTISEL-NEXT: kmovd %k1, %ecx ; FASTISEL-NEXT: kshiftrd $6, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r10d ; FASTISEL-NEXT: kshiftrd $7, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %r11d -; FASTISEL-NEXT: kshiftrd $8, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %ebx +; FASTISEL-NEXT: kshiftrd $8, %k0, %k1 +; FASTISEL-NEXT: kmovd %k1, %ebp ; FASTISEL-NEXT: kshiftrd $9, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r14d ; FASTISEL-NEXT: kshiftrd $10, %k0, %k1 -; FASTISEL-NEXT: kmovd %k1, %ebp +; FASTISEL-NEXT: kmovd %k1, %r11d ; FASTISEL-NEXT: kshiftrd $11, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r15d ; FASTISEL-NEXT: kshiftrd $12, %k0, %k1 @@ -1892,40 +1892,40 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kshiftrd $13, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r13d ; FASTISEL-NEXT: kshiftrd $14, %k0, %k1 -; FASTISEL-NEXT: andl $1, %ecx -; FASTISEL-NEXT: movb %cl, 2(%rax) -; FASTISEL-NEXT: kmovd %k0, %ecx -; FASTISEL-NEXT: andl $1, %ecx ; FASTISEL-NEXT: andl $1, %edx -; FASTISEL-NEXT: leal (%rcx,%rdx,2), %ecx +; FASTISEL-NEXT: movb %dl, 
2(%rax) +; FASTISEL-NEXT: kmovd %k0, %edx +; FASTISEL-NEXT: andl $1, %edx +; FASTISEL-NEXT: andl $1, %r9d +; FASTISEL-NEXT: leal (%rdx,%r9,2), %r9d ; FASTISEL-NEXT: kmovd %k1, %edx ; FASTISEL-NEXT: kshiftrd $15, %k0, %k0 +; FASTISEL-NEXT: andl $1, %r8d +; FASTISEL-NEXT: leal (%r9,%r8,4), %r9d +; FASTISEL-NEXT: kmovd %k0, %r8d ; FASTISEL-NEXT: andl $1, %esi -; FASTISEL-NEXT: leal (%rcx,%rsi,4), %ecx -; FASTISEL-NEXT: kmovd %k0, %esi +; FASTISEL-NEXT: leal (%r9,%rsi,8), %esi ; FASTISEL-NEXT: andl $1, %edi -; FASTISEL-NEXT: leal (%rcx,%rdi,8), %ecx -; FASTISEL-NEXT: andl $1, %r9d -; FASTISEL-NEXT: shll $4, %r9d -; FASTISEL-NEXT: orl %ecx, %r9d -; FASTISEL-NEXT: andl $1, %r8d -; FASTISEL-NEXT: shll $5, %r8d -; FASTISEL-NEXT: orl %r9d, %r8d +; FASTISEL-NEXT: shll $4, %edi +; FASTISEL-NEXT: orl %esi, %edi +; FASTISEL-NEXT: andl $1, %ecx +; FASTISEL-NEXT: shll $5, %ecx +; FASTISEL-NEXT: orl %edi, %ecx ; FASTISEL-NEXT: andl $1, %r10d ; FASTISEL-NEXT: shll $6, %r10d -; FASTISEL-NEXT: andl $1, %r11d -; FASTISEL-NEXT: shll $7, %r11d -; FASTISEL-NEXT: orl %r10d, %r11d ; FASTISEL-NEXT: andl $1, %ebx -; FASTISEL-NEXT: shll $8, %ebx -; FASTISEL-NEXT: orl %r11d, %ebx +; FASTISEL-NEXT: shll $7, %ebx +; FASTISEL-NEXT: orl %r10d, %ebx +; FASTISEL-NEXT: andl $1, %ebp +; FASTISEL-NEXT: shll $8, %ebp +; FASTISEL-NEXT: orl %ebx, %ebp ; FASTISEL-NEXT: andl $1, %r14d ; FASTISEL-NEXT: shll $9, %r14d -; FASTISEL-NEXT: orl %ebx, %r14d -; FASTISEL-NEXT: andl $1, %ebp -; FASTISEL-NEXT: shll $10, %ebp -; FASTISEL-NEXT: orl %r14d, %ebp -; FASTISEL-NEXT: orl %r8d, %ebp +; FASTISEL-NEXT: orl %ebp, %r14d +; FASTISEL-NEXT: andl $1, %r11d +; FASTISEL-NEXT: shll $10, %r11d +; FASTISEL-NEXT: orl %r14d, %r11d +; FASTISEL-NEXT: orl %ecx, %r11d ; FASTISEL-NEXT: andl $1, %r15d ; FASTISEL-NEXT: shll $11, %r15d ; FASTISEL-NEXT: andl $1, %r12d @@ -1937,11 +1937,11 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: andl $1, %edx ; FASTISEL-NEXT: shll $14, %edx ; FASTISEL-NEXT: orl %r13d, %edx -; FASTISEL-NEXT: andl $1, %esi -; FASTISEL-NEXT: shll $15, %esi -; FASTISEL-NEXT: orl %edx, %esi -; FASTISEL-NEXT: orl %ebp, %esi -; FASTISEL-NEXT: movw %si, (%rax) +; FASTISEL-NEXT: andl $1, %r8d +; FASTISEL-NEXT: shll $15, %r8d +; FASTISEL-NEXT: orl %edx, %r8d +; FASTISEL-NEXT: orl %r11d, %r8d +; FASTISEL-NEXT: movw %r8w, (%rax) ; FASTISEL-NEXT: popq %rbx ; FASTISEL-NEXT: popq %r12 ; FASTISEL-NEXT: popq %r13 @@ -2380,9 +2380,8 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: movb $-9, %dil -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k6 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k2 @@ -2403,13 +2402,14 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftrb $2, %k4, %k4 ; SKX-NEXT: korb %k4, %k0, %k0 ; SKX-NEXT: movb $-65, %dil -; SKX-NEXT: kmovd %edi, %k6 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kmovq %k1, %k2 +; SKX-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: korb %k5, %k0, %k0 -; SKX-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SKX-NEXT: 
korb %k5, %k0, %k5 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $6, %k0, %k0 @@ -2417,15 +2417,14 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k7, %k7 ; SKX-NEXT: kshiftrb $7, %k7, %k7 ; SKX-NEXT: korb %k0, %k7, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; SKX-NEXT: kshiftlb $7, %k7, %k7 ; SKX-NEXT: kshiftrb $5, %k7, %k7 ; SKX-NEXT: korb %k7, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload -; SKX-NEXT: kandb %k5, %k0, %k1 +; SKX-NEXT: kandb %k6, %k0, %k1 ; SKX-NEXT: kshiftlb $7, %k7, %k7 ; SKX-NEXT: kshiftrb $4, %k7, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 @@ -2434,19 +2433,18 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $3, %k0, %k0 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; SKX-NEXT: kandb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload -; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kshiftlb $7, %k0, %k0 @@ -2455,29 +2453,31 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kmovq %k4, %k2 +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovq %k5, %k7 -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kmovq %k6, %k7 +; SKX-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kandb %k3, %k0, %k0 -; SKX-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload ; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k5, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload ; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 @@ -2515,27 +2515,26 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: 
kandb %k6, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: korb %k5, %k1, %k5 -; SKX-NEXT: kshiftlb $7, %k7, %k1 -; SKX-NEXT: kshiftrb $6, %k1, %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kshiftlb $7, %k7, %k7 -; SKX-NEXT: kshiftrb $7, %k7, %k7 -; SKX-NEXT: korb %k1, %k7, %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kandb %k2, %k1, %k1 -; SKX-NEXT: kshiftlb $7, %k7, %k7 -; SKX-NEXT: kshiftrb $5, %k7, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; SKX-NEXT: korb %k7, %k1, %k1 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload -; SKX-NEXT: kandb %k3, %k1, %k1 +; SKX-NEXT: korb %k5, %k1, %k7 ; SKX-NEXT: kshiftlb $7, %k0, %k0 -; SKX-NEXT: kshiftrb $4, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; SKX-NEXT: kandb %k1, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k5, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 +; SKX-NEXT: korb %k1, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 @@ -2545,13 +2544,12 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kandb %k7, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload ; SKX-NEXT: kandb %k5, %k0, %k0 @@ -2563,29 +2561,27 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kandb %k2, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k7, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kandb %k4, %k0, %k0 -; SKX-NEXT: kmovq %k4, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k2, %k0, %k0 -; SKX-NEXT: kmovq %k2, %k3 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; 
SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 @@ -2598,30 +2594,28 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kmovq %k5, %k4 -; SKX-NEXT: kandb %k5, %k0, %k0 +; SKX-NEXT: kandb %k2, %k0, %k0 +; SKX-NEXT: kmovq %k2, %k7 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload ; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kandb %k6, %k0, %k2 +; SKX-NEXT: kandb %k3, %k0, %k2 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: korb %k1, %k2, %k1 -; SKX-NEXT: kmovq %k7, %k2 -; SKX-NEXT: kandb %k7, %k1, %k1 +; SKX-NEXT: kandb %k4, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $2, %k0, %k0 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kandb %k3, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 @@ -2636,32 +2630,28 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 ; SKX-NEXT: korb %k0, %k1, %k0 -; SKX-NEXT: kandb %k4, %k0, %k0 -; SKX-NEXT: kmovq %k4, %k7 +; SKX-NEXT: kandb %k7, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovq %k5, %k3 ; SKX-NEXT: kandb %k5, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kandb %k6, %k0, %k0 +; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kandb %k2, %k0, %k0 -; SKX-NEXT: kmovq %k2, %k5 +; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; SKX-NEXT: kandb %k4, %k0, %k0 +; SKX-NEXT: kandb %k6, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k2, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 @@ -2677,22 +2667,22 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $5, %k2, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k3, %k1, %k1 +; SKX-NEXT: kandb %k5, %k1, %k1 ; SKX-NEXT: kmovd %r8d, %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k6, %k1, %k1 +; SKX-NEXT: kandb %k3, %k1, %k1 ; SKX-NEXT: kmovd %r9d, %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k5, %k1, %k1 +; SKX-NEXT: kandb %k4, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k3, %k2 ; SKX-NEXT: kshiftrb $2, %k2, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 -; SKX-NEXT: kandb %k4, %k1, %k1 
+; SKX-NEXT: kandb %k6, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $1, %k2, %k2 @@ -3189,129 +3179,131 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftrb $3, %k2, %k2 ; FASTISEL-NEXT: korb %k2, %k0, %k0 ; FASTISEL-NEXT: movb $-33, %dil -; FASTISEL-NEXT: kmovd %edi, %k5 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovd %edi, %k6 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; FASTISEL-NEXT: kshiftlb $7, %k4, %k4 ; FASTISEL-NEXT: kshiftrb $2, %k4, %k4 ; FASTISEL-NEXT: korb %k4, %k0, %k0 ; FASTISEL-NEXT: movb $-65, %dil -; FASTISEL-NEXT: kmovd %edi, %k1 -; FASTISEL-NEXT: kandb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovq %k1, %k4 -; FASTISEL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 -; FASTISEL-NEXT: kshiftrb $1, %k6, %k6 -; FASTISEL-NEXT: korb %k6, %k0, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 -; FASTISEL-NEXT: kshiftrb $6, %k6, %k6 +; FASTISEL-NEXT: kmovd %edi, %k2 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $1, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 +; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 +; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $7, %k7, %k7 -; FASTISEL-NEXT: korb %k6, %k7, %k6 -; FASTISEL-NEXT: kandb %k3, %k6, %k6 +; FASTISEL-NEXT: korb %k0, %k7, %k0 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kmovq %k3, %k5 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $5, %k7, %k7 -; FASTISEL-NEXT: korb %k7, %k6, %k6 +; FASTISEL-NEXT: korb %k7, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k2, %k6, %k6 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k4, %k0, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $4, %k7, %k7 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: korb %k7, %k6, %k6 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k7, %k6, %k6 +; FASTISEL-NEXT: korb %k7, %k1, %k1 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k3, %k1, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $3, %k0, %k0 -; FASTISEL-NEXT: korb %k0, %k6, %k0 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 -; FASTISEL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 -; FASTISEL-NEXT: kshiftrb $2, %k6, %k6 -; FASTISEL-NEXT: korb %k6, %k0, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k6 -; FASTISEL-NEXT: kshiftrb $1, %k6, %k6 -; FASTISEL-NEXT: korb %k6, %k0, %k0 -; FASTISEL-NEXT: kandb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: 
kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 -; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 ; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kmovq %k3, %k7 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k2, %k0, %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 -; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 -; FASTISEL-NEXT: kshiftlb $7, %k6, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovq %k2, %k6 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 ; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kandb %k7, %k0, %k0 -; FASTISEL-NEXT: kmovq %k7, %k5 +; FASTISEL-NEXT: kmovq %k5, %k2 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovq %k4, %k7 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k1 +; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload ; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), 
%k1 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $7, %k5, %k5 +; FASTISEL-NEXT: korb %k1, %k5, %k1 +; FASTISEL-NEXT: kandb %k2, %k1, %k1 +; FASTISEL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $5, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k7, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $4, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k3, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $3, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k4, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $2, %k5, %k5 +; FASTISEL-NEXT: korb %k5, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; FASTISEL-NEXT: kandb %k6, %k1, %k1 +; FASTISEL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kshiftlb $7, %k5, %k5 +; FASTISEL-NEXT: kshiftrb $1, %k5, %k5 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: korb %k1, %k0, %k2 +; FASTISEL-NEXT: korb %k5, %k1, %k5 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k1 ; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 @@ -3319,35 +3311,36 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftrb $7, %k7, %k7 ; FASTISEL-NEXT: korb %k1, %k7, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: kandb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k2, %k1, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k7, %k7 ; FASTISEL-NEXT: kshiftrb $5, %k7, %k7 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; FASTISEL-NEXT: korb %k7, %k1, %k1 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k7, %k1, %k1 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k3, %k1, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 ; FASTISEL-NEXT: kshiftrb $4, %k0, %k0 ; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k2, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; 
FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kshiftlb $7, %k1, %k0 ; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 @@ -3356,69 +3349,68 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 ; FASTISEL-NEXT: korb %k0, %k1, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kmovq %k5, %k3 -; FASTISEL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k7, %k0, %k0 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k1 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k7, %k1 ; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovq %k4, %k5 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kandb %k2, %k0, %k0 +; FASTISEL-NEXT: kmovq %k2, %k7 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 +; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 +; FASTISEL-NEXT: kshiftrb $6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 -; FASTISEL-NEXT: kshiftrb $6, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $7, %k2, %k2 +; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 +; FASTISEL-NEXT: korb %k0, %k1, %k0 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 +; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kandb %k3, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 +; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kandb %k2, %k0, %k2 +; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; FASTISEL-NEXT: korb %k1, %k2, %k1 -; FASTISEL-NEXT: kandb %k3, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $5, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k7, %k1, %k1 -; FASTISEL-NEXT: kmovq %k7, %k3 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: 
kshiftrb $4, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kandb %k5, %k1, %k1 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $3, %k2, %k2 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k6, %k1, %k1 -; FASTISEL-NEXT: kmovq %k6, %k5 -; FASTISEL-NEXT: kshiftlb $7, %k7, %k2 -; FASTISEL-NEXT: kshiftrb $2, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 ; FASTISEL-NEXT: kandb %k4, %k1, %k1 -; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 -; FASTISEL-NEXT: kshiftrb $1, %k2, %k2 -; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k0, %k1, %k0 +; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 +; FASTISEL-NEXT: kshiftrb $2, %k0, %k0 +; FASTISEL-NEXT: korb %k0, %k1, %k0 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 +; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 +; FASTISEL-NEXT: korb %k1, %k0, %k0 +; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; FASTISEL-NEXT: kandb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; FASTISEL-NEXT: kshiftlb $7, %k0, %k0 @@ -3427,8 +3419,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $7, %k1, %k1 ; FASTISEL-NEXT: korb %k0, %k1, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k7, %k0, %k0 +; FASTISEL-NEXT: kandb %k6, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $5, %k1, %k1 @@ -3438,19 +3429,18 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $4, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload -; FASTISEL-NEXT: kandb %k6, %k0, %k0 +; FASTISEL-NEXT: kandb %k7, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $3, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; FASTISEL-NEXT: kandb %k5, %k0, %k0 +; FASTISEL-NEXT: kandb %k4, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k1, %k1 ; FASTISEL-NEXT: kshiftrb $2, %k1, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; FASTISEL-NEXT: korb %k1, %k0, %k0 -; FASTISEL-NEXT: kandb %k4, %k0, %k0 +; FASTISEL-NEXT: kandb %k5, %k0, %k0 ; FASTISEL-NEXT: kshiftlb $7, %k2, %k1 ; FASTISEL-NEXT: kshiftrb $1, %k1, %k1 ; FASTISEL-NEXT: korb %k1, %k0, %k0 @@ -3461,7 +3451,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $7, %k2, %k2 ; FASTISEL-NEXT: korb %k1, %k2, %k1 -; FASTISEL-NEXT: kandb %k7, %k1, %k1 +; FASTISEL-NEXT: kandb %k6, %k1, %k1 ; FASTISEL-NEXT: kmovd %ecx, %k2 ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $5, %k2, %k2 @@ -3471,17 +3461,17 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $4, %k2, %k2 ; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k6, %k1, %k1 +; FASTISEL-NEXT: kandb %k7, %k1, %k1 ; FASTISEL-NEXT: kmovd %r9d, %k2 ; FASTISEL-NEXT: kshiftlb 
$7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $3, %k2, %k2 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k5, %k1, %k1 +; FASTISEL-NEXT: kandb %k4, %k1, %k1 ; FASTISEL-NEXT: kshiftlb $7, %k3, %k2 ; FASTISEL-NEXT: kshiftrb $2, %k2, %k2 ; FASTISEL-NEXT: korb %k2, %k1, %k1 -; FASTISEL-NEXT: kandb %k4, %k1, %k1 +; FASTISEL-NEXT: kandb %k5, %k1, %k1 ; FASTISEL-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; FASTISEL-NEXT: kshiftlb $7, %k2, %k2 ; FASTISEL-NEXT: kshiftrb $1, %k2, %k2 diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll index 0caa8826e75c8..7a534721bae05 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -941,44 +941,43 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; X32-NEXT: pushl %ebx ; X32-NEXT: subl $12, %esp ; X32-NEXT: movl %esi, (%esp) # 4-byte Spill -; X32-NEXT: movl %edi, %esi -; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, %edi -; X32-NEXT: leal (%edx,%esi), %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: leal (%edx,%edi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: subl %esi, %ebx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: subl %ecx, %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl %edx, %eax +; X32-NEXT: subl %edi, %eax +; X32-NEXT: movl %ebp, %edx +; X32-NEXT: subl %ecx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: imull %edx, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, %edx +; X32-NEXT: subl {{[0-9]+}}(%esp), %edx +; X32-NEXT: imull %eax, %edx +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: subl %ebx, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: subl {{[0-9]+}}(%esp), %eax -; X32-NEXT: imull %ebx, %eax -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: imull %edi, %eax +; X32-NEXT: addl %edx, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: imull %ebp, %edi +; X32-NEXT: addl {{[0-9]+}}(%esp), %edx +; X32-NEXT: imull %edx, %ebp ; X32-NEXT: addl {{[0-9]+}}(%esp), %esi ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: addl %esi, %edi -; X32-NEXT: addl {{[0-9]+}}(%esp), %edx -; X32-NEXT: imull %eax, %edx -; X32-NEXT: addl %edx, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: addl %eax, %ebp +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: addl $12, %esp ; X32-NEXT: popl %ebx ; X32-NEXT: popl %ebp @@ -986,6 
+985,7 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: +; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi @@ -998,35 +998,36 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi ; WIN64-NEXT: leal (%rdx,%rdi), %ebx -; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx -; WIN64-NEXT: subl %edi, %edx -; WIN64-NEXT: leal (%rsi,%r8), %edi +; WIN64-NEXT: movl %edx, %ebp +; WIN64-NEXT: subl %edi, %ebp +; WIN64-NEXT: leal (%rsi,%r8), %edx ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi -; WIN64-NEXT: leal (%r9,%r10), %r8d -; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 -; WIN64-NEXT: subl %r10d, %r9d -; WIN64-NEXT: movl %eax, %r10d -; WIN64-NEXT: subl %ecx, %r10d -; WIN64-NEXT: imull %r10d, %r9d -; WIN64-NEXT: leal (%r11,%r12), %r10d -; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 -; WIN64-NEXT: subl %r12d, %r11d -; WIN64-NEXT: imull %edx, %r11d -; WIN64-NEXT: addl %r9d, %r11d -; WIN64-NEXT: leal (%r14,%r15), %edx -; WIN64-NEXT: movl %r14d, %r9d -; WIN64-NEXT: subl %r15d, %r9d -; WIN64-NEXT: imull %esi, %r9d -; WIN64-NEXT: addl %r11d, %r9d +; WIN64-NEXT: leal (%r9,%r10), %edi +; WIN64-NEXT: movl %r9d, %r8d +; WIN64-NEXT: subl %r10d, %r8d +; WIN64-NEXT: movl %eax, %r9d +; WIN64-NEXT: subl %ecx, %r9d +; WIN64-NEXT: imull %r9d, %r8d +; WIN64-NEXT: leal (%r11,%r12), %r9d +; WIN64-NEXT: movl %r11d, %r10d +; WIN64-NEXT: subl %r12d, %r10d +; WIN64-NEXT: imull %ebp, %r10d +; WIN64-NEXT: addl %r8d, %r10d +; WIN64-NEXT: leal (%r14,%r15), %r8d +; WIN64-NEXT: movl %r14d, %r11d +; WIN64-NEXT: subl %r15d, %r11d +; WIN64-NEXT: imull %esi, %r11d +; WIN64-NEXT: addl %r10d, %r11d ; WIN64-NEXT: addl %ecx, %eax -; WIN64-NEXT: imull %r8d, %eax -; WIN64-NEXT: imull %ebx, %r10d -; WIN64-NEXT: addl %r10d, %eax -; WIN64-NEXT: imull %edi, %edx -; WIN64-NEXT: addl %edx, %eax +; WIN64-NEXT: imull %edi, %eax +; WIN64-NEXT: imull %ebx, %r9d ; WIN64-NEXT: addl %r9d, %eax +; WIN64-NEXT: imull %edx, %r8d +; WIN64-NEXT: addl %r8d, %eax +; WIN64-NEXT: addl %r11d, %eax ; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; ; LINUXOSX64-LABEL: testi32_inp: @@ -1040,35 +1041,35 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX64-NEXT: # kill: def $edx killed $edx killed $rdx -; LINUXOSX64-NEXT: subl %edi, %edx -; LINUXOSX64-NEXT: leal (%rsi,%r8), %edi +; LINUXOSX64-NEXT: movl %edx, %r11d +; LINUXOSX64-NEXT: subl %edi, %r11d +; LINUXOSX64-NEXT: leal (%rsi,%r8), %edx ; LINUXOSX64-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX64-NEXT: subl %r8d, %esi -; LINUXOSX64-NEXT: leal (%r9,%r12), %r8d -; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d killed $r9 -; LINUXOSX64-NEXT: subl %r12d, %r9d -; LINUXOSX64-NEXT: movl %eax, %r11d -; LINUXOSX64-NEXT: subl %ecx, %r11d -; LINUXOSX64-NEXT: imull %r11d, %r9d -; LINUXOSX64-NEXT: leal (%r13,%r14), %r11d +; LINUXOSX64-NEXT: leal (%r9,%r12), %edi +; LINUXOSX64-NEXT: movl %r9d, %r8d +; LINUXOSX64-NEXT: subl %r12d, %r8d +; LINUXOSX64-NEXT: movl %eax, %r9d +; LINUXOSX64-NEXT: subl %ecx, %r9d +; LINUXOSX64-NEXT: 
imull %r9d, %r8d +; LINUXOSX64-NEXT: leal (%r13,%r14), %r9d ; LINUXOSX64-NEXT: movl %r13d, %r12d ; LINUXOSX64-NEXT: subl %r14d, %r12d -; LINUXOSX64-NEXT: imull %edx, %r12d -; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %edx -; LINUXOSX64-NEXT: addl %r9d, %r12d -; LINUXOSX64-NEXT: movl %r15d, %r9d -; LINUXOSX64-NEXT: subl %edx, %r9d -; LINUXOSX64-NEXT: imull %esi, %r9d -; LINUXOSX64-NEXT: addl %r12d, %r9d +; LINUXOSX64-NEXT: imull %r11d, %r12d +; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; LINUXOSX64-NEXT: addl %r8d, %r12d +; LINUXOSX64-NEXT: movl %r15d, %r8d +; LINUXOSX64-NEXT: subl %r11d, %r8d +; LINUXOSX64-NEXT: imull %esi, %r8d +; LINUXOSX64-NEXT: addl %r12d, %r8d ; LINUXOSX64-NEXT: addl %ecx, %eax -; LINUXOSX64-NEXT: imull %r8d, %eax -; LINUXOSX64-NEXT: imull %r10d, %r11d -; LINUXOSX64-NEXT: addl %r11d, %eax -; LINUXOSX64-NEXT: addl %r15d, %edx -; LINUXOSX64-NEXT: imull %edi, %edx -; LINUXOSX64-NEXT: addl %edx, %eax +; LINUXOSX64-NEXT: imull %edi, %eax +; LINUXOSX64-NEXT: imull %r10d, %r9d ; LINUXOSX64-NEXT: addl %r9d, %eax +; LINUXOSX64-NEXT: addl %r15d, %r11d +; LINUXOSX64-NEXT: imull %edx, %r11d +; LINUXOSX64-NEXT: addl %r11d, %eax +; LINUXOSX64-NEXT: addl %r8d, %eax ; LINUXOSX64-NEXT: retq %x1 = sub i32 %a1, %a2 %x2 = sub i32 %a3, %a4 diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 9cb3ceae16f09..51ffeca52a665 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1941,45 +1941,47 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind { define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_cmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] +; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k2 
# encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] ; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] +; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] ; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x0c] +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x10] ; X86-NEXT: popl %esi # encoding: [0x5e] +; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2114,45 +2116,47 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind { define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_x86_avx512_ucmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] +; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] ; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k2, %eax # 
encoding: [0xc5,0xfb,0x93,0xc2] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] +; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] ; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] ; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] +; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] ; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] ; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] +; X86-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x0c] +; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x10] ; X86-NEXT: popl %esi # encoding: [0x5e] +; X86-NEXT: popl %edi # encoding: [0x5f] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index bcae88259a92e..9daac1df1d975 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -710,7 +710,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $56, %esp +; X86-NEXT: subl $60, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -733,8 +733,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431633920, %ebp # imm = 0x55550000 ; X86-NEXT: shrl %ebx ; X86-NEXT: andl $1431633920, %ebx # imm = 0x55550000 -; X86-NEXT: leal (%ebx,%ebp,2), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: leal (%ebx,%ebp,2), %ebp ; X86-NEXT: bswapl %edi ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F @@ -751,7 +750,8 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ebx # imm = 0x55555555 ; X86-NEXT: shrl %edi ; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NEXT: leal (%edi,%ebx,2), %ebx +; X86-NEXT: leal (%edi,%ebx,2), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill ; X86-NEXT: bswapl %esi ; X86-NEXT: movl %esi, %edi ; X86-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F @@ -768,8 +768,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555 ; X86-NEXT: shrl %esi ; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 -; X86-NEXT: leal (%esi,%edi,2), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal (%esi,%edi,2), %ebx ; X86-NEXT: bswapl %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F @@ -898,8 +897,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%ecx,2), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal (%eax,%ecx,2), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -937,7 +935,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -1012,7 +1010,8 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%ecx,2), %edi +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bswapl %eax ; X86-NEXT: movl %eax, %ecx @@ -1031,16 +1030,11 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %edx -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: shrdl $16, %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $16, %eax, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shrdl $16, %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $16, %eax, %ecx +; X86-NEXT: shrdl $16, %ecx, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shrdl $16, %ebx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shrdl $16, %ecx, %eax @@ -1060,25 +1054,30 @@ define i528 @large_promotion(i528 %A) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shrdl $16, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl $16, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NEXT: shrdl $16, %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; 
X86-NEXT: shrdl $16, %ebp, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: shrdl $16, %ebx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shrdl $16, %eax, %ebx
-; X86-NEXT: shrdl $16, %edi, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: shrdl $16, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shrdl $16, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shrdl $16, %ecx, %edi
+; X86-NEXT: shrdl $16, %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edi, 60(%eax)
-; X86-NEXT: movl %ecx, 56(%eax)
+; X86-NEXT: movl %ecx, 60(%eax)
+; X86-NEXT: movl %edi, 56(%eax)
; X86-NEXT: movl %ebx, 52(%eax)
; X86-NEXT: movl %ebp, 48(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, 44(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, 40(%eax)
@@ -1098,12 +1097,12 @@ define i528 @large_promotion(i528 %A) nounwind {
; X86-NEXT: movl %ecx, 12(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, 4(%eax)
; X86-NEXT: movl %esi, (%eax)
; X86-NEXT: shrl $16, %edx
; X86-NEXT: movw %dx, 64(%eax)
-; X86-NEXT: addl $56, %esp
+; X86-NEXT: addl $60, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
index 314df786b3e80..cee8489e9aaea 100644
--- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
+++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
@@ -81,19 +81,18 @@ define void @func_large() !prof !0 {
; CHECK: b7
; CHECK: b9
;
-; An expected output with chain-split-threshold=1 (disabling splitting) -- the
-; increase of the layout score is smaller, ~7%:
+; An expected output with chain-split-threshold=1 (disabling split point enumeration); the increase of the layout score is larger, ~18%:
;
; CHECK2-LABEL: Applying ext-tsp layout
; CHECK2: original layout score: 9171074274.27
-; CHECK2: optimized layout score: 9810644873.57
+; CHECK2: optimized layout score: 10844307310.87
; CHECK2: b0
; CHECK2: b2
; CHECK2: b3
; CHECK2: b4
; CHECK2: b5
-; CHECK2: b1
; CHECK2: b8
+; CHECK2: b1
; CHECK2: b6
; CHECK2: b7
; CHECK2: b9
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index bfb9885c10c4e..460251ffa6c5d 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -253,7 +253,8 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LIS-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7]
; CHECK-LIS-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,1,1]
; CHECK-LIS-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; CHECK-LIS-NEXT: por %xmm2, %xmm0
+; CHECK-LIS-NEXT: por %xmm0, %xmm2
+; CHECK-LIS-NEXT: movdqa %xmm2, %xmm0
; CHECK-LIS-NEXT: retq
 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll
index dc8c0e13edcaa..65d74c8f262a3 100644
---
a/llvm/test/CodeGen/X86/combine-rotates.ll +++ b/llvm/test/CodeGen/X86/combine-rotates.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,SSE2-LV -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -early-live-intervals | FileCheck %s --check-prefixes=CHECK,SSE2,SSE2-LIS +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -early-live-intervals | FileCheck %s --check-prefixes=CHECK,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,XOP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512 @@ -115,56 +115,30 @@ define i32 @combine_rot_select_zero(i32, i32) { } define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) { -; SSE2-LV-LABEL: combine_vec_rot_select_zero: -; SSE2-LV: # %bb.0: -; SSE2-LV-NEXT: pxor %xmm2, %xmm2 -; SSE2-LV-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-LV-NEXT: pslld $23, %xmm1 -; SSE2-LV-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-LV-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-LV-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-LV-NEXT: movdqa %xmm0, %xmm3 -; SSE2-LV-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-LV-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; SSE2-LV-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-LV-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-LV-NEXT: pmuludq %xmm5, %xmm1 -; SSE2-LV-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] -; SSE2-LV-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-LV-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-LV-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-LV-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-LV-NEXT: por %xmm4, %xmm3 -; SSE2-LV-NEXT: pand %xmm2, %xmm0 -; SSE2-LV-NEXT: pandn %xmm3, %xmm2 -; SSE2-LV-NEXT: por %xmm2, %xmm0 -; SSE2-LV-NEXT: retq -; -; SSE2-LIS-LABEL: combine_vec_rot_select_zero: -; SSE2-LIS: # %bb.0: -; SSE2-LIS-NEXT: pxor %xmm2, %xmm2 -; SSE2-LIS-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-LIS-NEXT: pslld $23, %xmm1 -; SSE2-LIS-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-LIS-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-LIS-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-LIS-NEXT: movdqa %xmm0, %xmm3 -; SSE2-LIS-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-LIS-NEXT: pmuludq %xmm5, %xmm1 -; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] -; SSE2-LIS-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-LIS-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-LIS-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-LIS-NEXT: por %xmm4, %xmm3 -; SSE2-LIS-NEXT: pand %xmm2, %xmm0 -; SSE2-LIS-NEXT: pandn %xmm3, %xmm2 -; SSE2-LIS-NEXT: por %xmm0, %xmm2 -; SSE2-LIS-NEXT: movdqa %xmm2, %xmm0 -; SSE2-LIS-NEXT: retq +; SSE2-LABEL: combine_vec_rot_select_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq ; ; XOP-LABEL: combine_vec_rot_select_zero: ; XOP: # %bb.0: diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll index 222a4d78668b6..3efd536adc4d1 100644 --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -51,26 +51,26 @@ define i96 @square_high(i96 %x) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: setb %al ; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 7ce11ad6abaf3..d26f4b7044cf3 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -177,104 +177,103 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $156, %esp +; X86-NEXT: subl $152, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %esi -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edi, %ebx -; X86-NEXT: sbbl %edi, %ebp -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: orl %edx, %ecx ; X86-NEXT: movl %edx, %edi +; X86-NEXT: xorl %ebp, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: subl %edx, %ebp +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl -; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %esi, %edx +; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %edi, %ecx +; X86-NEXT: bsrl %ebx, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %ebp, %edx +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: bsrl %ebx, %ebx -; X86-NEXT: xorl $31, %ebx -; X86-NEXT: addl $32, %ebx ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: cmovnel %edx, %ebx -; X86-NEXT: addl $64, %ebx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: bsrl %ebp, %ebp +; X86-NEXT: xorl $31, %ebp +; X86-NEXT: addl $32, %ebp ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %edi -; X86-NEXT: cmovnel %ecx, %ebx +; X86-NEXT: testl %esi, %esi +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: addl $64, %ebp +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: cmovnel %ecx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: bsrl %ebp, %ecx +; 
X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: bsrl %ebx, %esi ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: addl $64, %edx -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: orl %edi, %esi ; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: subl %edx, %ebx +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: subl %edx, %ebp ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: movl $0, %edx @@ -282,8 +281,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi ; X86-NEXT: movl $127, %ecx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebp, %ecx +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ecx @@ -291,36 +291,35 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: cmovnel %ebp, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: cmovnel %ebp, %edi -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: cmovnel %ebp, %esi -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovnel %ebx, %edi +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %eax +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.8: # %_udiv-special-cases -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl $127, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: xorl $127, %ebp +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: je .LBB4_9 ; X86-NEXT: # %bb.5: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %eax # 
4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -329,89 +328,93 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: xorb $127, %al ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 148(%esp,%eax), %edx -; X86-NEXT: movl 152(%esp,%eax), %ebx +; X86-NEXT: movsbl %al, %edi +; X86-NEXT: movl 144(%esp,%edi), %edx +; X86-NEXT: movl 148(%esp,%edi), %esi ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 144(%esp,%eax), %ebp -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: shrl %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl 140(%esp,%eax), %eax +; X86-NEXT: movl 140(%esp,%edi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shrl %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl 136(%esp,%edi), %esi ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %eax, %ebp -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl $1, %esi +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: shll %cl, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $0, %eax +; X86-NEXT: addl $1, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $0, %esi ; X86-NEXT: jae .LBB4_2 ; X86-NEXT: # %bb.6: +; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_1: -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: jmp .LBB4_9 ; X86-NEXT: .LBB4_2: # %udiv-preheader +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %al, %ch +; X86-NEXT: movb %bl, %ch ; X86-NEXT: andb $7, %ch -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $15, %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 104(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movb %bl, %cl +; X86-NEXT: shrb $3, %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl %cl, %ebx +; X86-NEXT: movl 100(%esp,%ebx), %esi +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%ebx), %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %ebp ; X86-NEXT: movb %ch, %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shrdl %cl, %esi, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl 92(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%eax), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: shrdl %cl, %esi, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl 88(%esp,%ebx), %esi +; X86-NEXT: movl 92(%esp,%ebx), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: notb %cl -; X86-NEXT: addl %ebx, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %edi +; X86-NEXT: shll %cl, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; X86-NEXT: shrdl %cl, %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -421,179 +424,177 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %esi +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shldl $1, %ebp, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %ebp +; X86-NEXT: shldl $1, %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %edx +; X86-NEXT: shldl $1, %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl $1, %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $1, %ebp, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %ebp -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %ecx, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: andl $1, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %edi, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: sbbl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %edi ; X86-NEXT: adcl $-1, %edx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: jne .LBB4_3 ; X86-NEXT: # %bb.4: +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: shldl $1, %ebp, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ebp -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: addl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: shldl $1, %eax, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: .LBB4_9: # %udiv-end -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: xorl %eax, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: subl %eax, %esi +; X86-NEXT: shldl $1, %esi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: addl %esi, %esi +; X86-NEXT: orl %ebp, %esi +; X86-NEXT: .LBB4_9: # %udiv-end +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: subl %ebx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl %ebp, 4(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 12(%edx) -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %edi, 12(%ecx) ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl (%esp), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull %edi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull %esi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull %edx, %ebx +; X86-NEXT: imull %edx, %ebp ; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: subl (%esp), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl %eax, %esi ; X86-NEXT: 
movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: addl $156, %esp +; X86-NEXT: addl $152, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -607,24 +608,24 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %rbx -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rsi, %r12 -; X64-NEXT: movq %rdi, %r13 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %rcx, %r12 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdx, 8(%rbx) -; X64-NEXT: movq %rax, (%rbx) -; X64-NEXT: imulq %rax, %r14 -; X64-NEXT: mulq %r15 -; X64-NEXT: addq %r14, %rdx -; X64-NEXT: imulq %r15, %rcx +; X64-NEXT: movq %rdx, 8(%r15) +; X64-NEXT: movq %rax, (%r15) +; X64-NEXT: imulq %rax, %r12 +; X64-NEXT: mulq %r13 +; X64-NEXT: addq %r12, %rdx +; X64-NEXT: imulq %r13, %rcx ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: subq %rax, %r13 -; X64-NEXT: sbbq %rcx, %r12 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r12, %rdx +; X64-NEXT: subq %rax, %r14 +; X64-NEXT: sbbq %rcx, %rbx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rbx, %rdx ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -713,23 +714,23 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: movsbl (%esp), %eax ; X86-NEXT: idivb {{[0-9]+}}(%esp) ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-NEXT: movd %edx, %xmm4 +; X86-NEXT: movd %edx, %xmm7 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm7 +; X86-NEXT: movd %esi, %xmm4 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; X86-NEXT: movd %edi, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; X86-NEXT: movd %ebx, %xmm4 +; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; X86-NEXT: movd %ebx, %xmm5 ; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movd %ecx, %xmm5 +; X86-NEXT: movd %ecx, %xmm6 ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; X86-NEXT: movdqa %xmm2, %xmm4 ; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; X86-NEXT: movdqa %xmm4, (%ecx) @@ -816,47 +817,47 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X64-NEXT: movd %r8d, %xmm5 ; X64-NEXT: movd %r9d, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r10d, %xmm2 +; X64-NEXT: movd %r10d, %xmm7 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; X64-NEXT: movd %r11d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %ebx, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: movd %ebx, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; X64-NEXT: movd %ebp, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-NEXT: movd %r14d, %xmm4 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: movd %r15d, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; X64-NEXT: movd %r15d, %xmm6 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; X64-NEXT: movd %r12d, %xmm5 -; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; X64-NEXT: movd %r13d, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; X64-NEXT: movd %r13d, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; X64-NEXT: movd %edx, %xmm6 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: movd %eax, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; X64-NEXT: movdqa %xmm6, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: movd %eax, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-NEXT: movdqa %xmm3, %xmm4 +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movdqa %xmm2, (%rax) +; X64-NEXT: movdqa %xmm4, (%rax) +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: pmullw %xmm2, %xmm4 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; X64-NEXT: pand %xmm2, %xmm4 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X64-NEXT: pmullw %xmm3, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm3, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: pmullw %xmm6, %xmm1 -; X64-NEXT: pand %xmm3, %xmm1 -; X64-NEXT: packuswb %xmm2, %xmm1 +; X64-NEXT: pmullw %xmm3, %xmm1 +; X64-NEXT: pand %xmm2, %xmm1 +; X64-NEXT: packuswb %xmm4, %xmm1 ; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 454f8b7242dc2..ebb95f16a723c 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -178,13 +178,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $132, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: orl %edi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sete %bl @@ -197,29 +197,31 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movb %al, (%esp) # 1-byte Spill ; 
X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %ebp, %ecx +; X86-NEXT: bsrl %edi, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bsrl %eax, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: addl $32, %eax -; X86-NEXT: testl %edi, %edi -; X86-NEXT: cmovnel %edx, %eax -; X86-NEXT: addl $64, %eax -; X86-NEXT: orl %esi, %ebp -; X86-NEXT: cmovnel %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: bsrl %ebp, %edx +; X86-NEXT: bsrl %ebp, %ebp +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl $31, %ebp +; X86-NEXT: addl $32, %ebp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: addl $64, %ebp +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: cmovnel %ecx, %ebp +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: bsrl %ebx, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: bsrl %edi, %esi @@ -228,67 +230,69 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %edi, %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: addl $64, %edx ; X86-NEXT: movl %ebx, %esi -; X86-NEXT: orl %ebp, %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovnel %ecx, %edx ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: subl %edx, %ebp ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: movl $0, %edi ; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: movl $127, %edx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebp, %edx ; X86-NEXT: movl $0, %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: movl $0, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: movl $0, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edi, %edx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %ebx, %edx ; X86-NEXT: setb %dl ; X86-NEXT: orb (%esp), %dl # 1-byte Folded Reload -; X86-NEXT: cmovnel %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovnel %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovnel %ecx, %esi +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmovnel %ecx, %ebp ; X86-NEXT: jne .LBB4_8 ; X86-NEXT: # %bb.1: # %_udiv-special-cases -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: xorl $127, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 
%esi, %ecx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %edi, %ebp ; X86-NEXT: je .LBB4_8 ; X86-NEXT: # %bb.2: # %udiv-bb1 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: xorb $127, %al ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch @@ -297,69 +301,69 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax ; X86-NEXT: movl 124(%esp,%eax), %edx -; X86-NEXT: movl 128(%esp,%eax), %edi +; X86-NEXT: movl 128(%esp,%eax), %esi ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 120(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shrl %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl 116(%esp,%eax), %edx +; X86-NEXT: movl 120(%esp,%eax), %ebp +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: shrl %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl 116(%esp,%eax), %ebp ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: addl $1, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: shldl %cl, %ebp, %edx +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: addl $1, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: jae .LBB4_3 ; X86-NEXT: # %bb.6: ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_3: # %udiv-preheader -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 80(%esp,%eax), %esi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %ebp +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 76(%esp,%eax), %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %esi, %edx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%eax), %edx -; X86-NEXT: movl 72(%esp,%eax), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: shrdl %cl, %ebp, %ebx +; X86-NEXT: movl 68(%esp,%eax), %esi +; X86-NEXT: movl 72(%esp,%eax), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: notb %cl ; X86-NEXT: addl %edi, %edi @@ -367,59 +371,56 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: orl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: shrdl %cl, %ebp, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %ebp +; X86-NEXT: shrdl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_4: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
-; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shldl $1, %ebx, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: shldl $1, %ebx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %edx +; X86-NEXT: shldl $1, %ebx, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ebx +; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: shldl $1, %ebp, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %ebp -; X86-NEXT: orl %eax, %ebp +; X86-NEXT: orl %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shldl $1, %eax, %ecx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax @@ -427,122 +428,123 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: andl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl %ecx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx -; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload +; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl $-1, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $-1, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: jne .LBB4_4 ; X86-NEXT: # %bb.5: -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ebp -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: shldl $1, %ebx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: shldl $1, %edx, %ebx +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: shldl $1, %ebx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: shldl $1, %ebp, %ebx ; X86-NEXT: orl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edx, %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: addl %ebp, %ebp +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: .LBB4_8: # %udiv-end -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl %esi, 8(%ecx) +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: imull %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %edx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: imull %ecx, %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %esi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: 
movl %eax, %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebp ; X86-NEXT: addl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: addl (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: subl (%esp), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: addl $132, %esp ; X86-NEXT: popl %esi @@ -558,24 +560,24 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %rbx -; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rsi, %r12 -; X64-NEXT: movq %rdi, %r13 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %rcx, %r12 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdi, %r14 ; X64-NEXT: callq 
__udivti3@PLT ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdx, 8(%rbx) -; X64-NEXT: movq %rax, (%rbx) -; X64-NEXT: imulq %rax, %r14 -; X64-NEXT: mulq %r15 -; X64-NEXT: addq %r14, %rdx -; X64-NEXT: imulq %r15, %rcx +; X64-NEXT: movq %rdx, 8(%r15) +; X64-NEXT: movq %rax, (%r15) +; X64-NEXT: imulq %rax, %r12 +; X64-NEXT: mulq %r13 +; X64-NEXT: addq %r12, %rdx +; X64-NEXT: imulq %r13, %rcx ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: subq %rax, %r13 -; X64-NEXT: sbbq %rcx, %r12 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r12, %rdx +; X64-NEXT: subq %rax, %r14 +; X64-NEXT: sbbq %rcx, %rbx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rbx, %rdx ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -664,23 +666,23 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X86-NEXT: movzbl (%esp), %eax ; X86-NEXT: divb {{[0-9]+}}(%esp) ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-NEXT: movd %edx, %xmm4 +; X86-NEXT: movd %edx, %xmm7 ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X86-NEXT: movd %esi, %xmm7 +; X86-NEXT: movd %esi, %xmm4 ; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; X86-NEXT: movd %edi, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; X86-NEXT: movd %ebx, %xmm4 +; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; X86-NEXT: movd %ebx, %xmm5 ; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movd %ecx, %xmm5 +; X86-NEXT: movd %ecx, %xmm6 ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; X86-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; X86-NEXT: movdqa %xmm2, %xmm4 ; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; X86-NEXT: movdqa %xmm4, (%ecx) @@ -767,47 +769,47 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi ; X64-NEXT: movd %r8d, %xmm5 ; X64-NEXT: movd %r9d, %xmm6 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movd %r10d, %xmm2 +; X64-NEXT: movd %r10d, %xmm7 ; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; X64-NEXT: movd %r11d, %xmm4 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; X64-NEXT: movd %ebx, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; X64-NEXT: movd %ebp, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: movd %ebx, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; X64-NEXT: movd %ebp, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-NEXT: movd %r14d, %xmm4 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: movd %r15d, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; X64-NEXT: movd %r15d, %xmm6 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; X64-NEXT: movd %r12d, %xmm5 -; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; X64-NEXT: movd %r13d, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; X64-NEXT: movd %r13d, %xmm3 +; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; X64-NEXT: movd %edx, %xmm6 ; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; X64-NEXT: movzbl %al, %eax -; X64-NEXT: movd %eax, %xmm6 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; X64-NEXT: movdqa %xmm6, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; X64-NEXT: movd %eax, %xmm3 +; 
X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-NEXT: movdqa %xmm3, %xmm4 +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movdqa %xmm2, (%rax) +; X64-NEXT: movdqa %xmm4, (%rax) +; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: pmullw %xmm2, %xmm4 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; X64-NEXT: pand %xmm2, %xmm4 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X64-NEXT: pmullw %xmm3, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm3, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: pmullw %xmm6, %xmm1 -; X64-NEXT: pand %xmm3, %xmm1 -; X64-NEXT: packuswb %xmm2, %xmm1 +; X64-NEXT: pmullw %xmm3, %xmm1 +; X64-NEXT: pand %xmm2, %xmm1 +; X64-NEXT: packuswb %xmm4, %xmm1 ; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll index 0ce461f648e45..03de1533e1d64 100644 --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -1129,10 +1129,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x4c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill @@ -1174,10 +1174,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) 
## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1187,10 +1187,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1200,10 +1200,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] @@ -1213,10 +1213,10 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll index a8636a3496dc4..5ea2964057588 100644 --- a/llvm/test/CodeGen/X86/fold-tied-op.ll +++ b/llvm/test/CodeGen/X86/fold-tied-op.ll @@ -24,46 +24,45 @@ define i64 @fn1() #0 { ; CHECK-NEXT: .cfi_offset %esi, -20 ; CHECK-NEXT: .cfi_offset %edi, -16 ; CHECK-NEXT: .cfi_offset %ebx, -12 -; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D -; CHECK-NEXT: movl $668265295, %ecx # imm = 0x27D4EB4F +; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D +; CHECK-NEXT: movl $668265295, %esi # imm = 0x27D4EB4F ; CHECK-NEXT: movl a, %edi ; CHECK-NEXT: cmpl $0, (%edi) ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl 8(%edi), %esi -; CHECK-NEXT: movl 12(%edi), %eax -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shldl $1, %esi, %edx -; CHECK-NEXT: orl %eax, %edx -; CHECK-NEXT: leal (%esi,%esi), %eax -; CHECK-NEXT: orl %esi, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 8(%edi), %ecx +; CHECK-NEXT: movl 12(%edi), %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shldl $1, %ecx, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: leal (%ecx,%ecx), %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl 16(%edi), %ebx -; CHECK-NEXT: movl 20(%edi), %esi -; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: shldl $2, %ebx, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %ebx, %eax -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: shldl $31, %eax, %ebx -; CHECK-NEXT: shll $2, %eax -; CHECK-NEXT: orl %ebx, %eax -; CHECK-NEXT: shrl %esi -; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: adcl %edx, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 20(%edi), %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: shldl $2, %ebx, %edx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: shldl $31, %ebx, %ecx +; CHECK-NEXT: shll $2, %ebx +; CHECK-NEXT: orl %ecx, %ebx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: shrl %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: adcl %eax, %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl 24(%edi), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D ; CHECK-NEXT: imull %eax, %ebx -; CHECK-NEXT: mull %ecx -; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: mull %esi +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: addl %ebx, %edx ; CHECK-NEXT: movl 28(%edi), %edi -; CHECK-NEXT: imull %edi, %ecx -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull %edi, %esi +; CHECK-NEXT: addl %edx, %esi ; CHECK-NEXT: movl $1336530590, %edx # imm = 0x4FA9D69E ; 
CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; CHECK-NEXT: movl %ebx, %eax @@ -72,17 +71,17 @@ define i64 @fn1() #0 { ; CHECK-NEXT: addl %edx, %ebx ; CHECK-NEXT: imull $1336530590, %edi, %edx # imm = 0x4FA9D69E ; CHECK-NEXT: addl %ebx, %edx -; CHECK-NEXT: shrdl $3, %ecx, %esi -; CHECK-NEXT: sarl $3, %ecx -; CHECK-NEXT: orl %edx, %ecx -; CHECK-NEXT: orl %eax, %esi +; CHECK-NEXT: shrdl $3, %esi, %ecx +; CHECK-NEXT: sarl $3, %esi +; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movl $-66860409, %ebx # imm = 0xFC03CA87 -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: mull %ebx ; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: imull $326129324, %esi, %eax # imm = 0x137056AC +; CHECK-NEXT: imull $326129324, %ecx, %eax # imm = 0x137056AC ; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87 +; CHECK-NEXT: imull $-66860409, %esi, %ecx # imm = 0xFC03CA87 ; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload @@ -95,14 +94,14 @@ define i64 @fn1() #0 { ; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87 ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.else -; CHECK-NEXT: xorl b+4, %ebx -; CHECK-NEXT: xorl b, %ecx +; CHECK-NEXT: xorl b+4, %ecx +; CHECK-NEXT: xorl b, %esi ; CHECK-NEXT: movl $1419758215, %edx # imm = 0x549FCA87 -; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: mull %edx -; CHECK-NEXT: imull $93298681, %ecx, %esi # imm = 0x58F9FF9 +; CHECK-NEXT: imull $93298681, %esi, %esi # imm = 0x58F9FF9 ; CHECK-NEXT: addl %edx, %esi -; CHECK-NEXT: imull $1419758215, %ebx, %ecx # imm = 0x549FCA87 +; CHECK-NEXT: imull $1419758215, %ecx, %ecx # imm = 0x549FCA87 ; CHECK-NEXT: .LBB0_3: # %if.end ; CHECK-NEXT: addl %esi, %ecx ; CHECK-NEXT: addl $-1028477341, %eax # imm = 0xC2B2AE63 diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll index 5cd9281929a32..2856cfa01fad1 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -1212,10 +1212,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: movq %r14, %rbp ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r15, %rdx ; CHECK-NEXT: cmovaq %r13, %rax -; CHECK-NEXT: movq $-1, %r13 +; CHECK-NEXT: movq $-1, %r14 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1230,11 +1231,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbp, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r15, %rdx -; CHECK-NEXT: cmovaq %r13, %rax -; CHECK-NEXT: movq $-1, %r13 +; CHECK-NEXT: cmovaq %r14, %rax +; CHECK-NEXT: movq $-1, %r14 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1249,10 +1250,12 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = 
mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: cmovbq %r14, %rdx +; CHECK-NEXT: cmovbq %rbp, %rdx +; CHECK-NEXT: movq %rbp, %r13 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r15, %rdx -; CHECK-NEXT: cmovaq %r13, %rax +; CHECK-NEXT: cmovaq %r14, %rax +; CHECK-NEXT: movq $-1, %r14 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1268,12 +1271,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: cmovbq %r14, %rbp -; CHECK-NEXT: movq %r14, %r13 +; CHECK-NEXT: cmovbq %r13, %rbp ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r15, %rbp -; CHECK-NEXT: movq $-1, %rcx -; CHECK-NEXT: cmovaq %rcx, %rax +; CHECK-NEXT: movq %r15, %r13 +; CHECK-NEXT: cmovaq %r14, %rax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpq %r12, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1289,10 +1291,10 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbq %r12, %r14 -; CHECK-NEXT: cmovbq %r13, %r15 +; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; CHECK-NEXT: cmovbq %rax, %r15 ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmovaq %rax, %r15 +; CHECK-NEXT: cmovaq %r13, %r15 ; CHECK-NEXT: movq $-1, %rax ; CHECK-NEXT: cmovaq %rax, %r14 ; CHECK-NEXT: ucomiss %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index 36bf74f573155..065b396e82ec3 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -267,8 +267,8 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -276,16 +276,16 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: jne .LBB6_1 ; X86-FAST-NEXT: # %bb.2: ; X86-FAST-NEXT: movl %ebx, %ebp -; X86-FAST-NEXT: movl %edx, %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl %edi, %eax -; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: movl %esi, %ebx ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl %edi, %eax +; X86-FAST-NEXT: movl %edx, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %edx, %esi -; X86-FAST-NEXT: movl %edi, %edx +; X86-FAST-NEXT: movl %esi, %edx +; X86-FAST-NEXT: movl %edi, %esi ; X86-FAST-NEXT: movl %ebx, %edi ; X86-FAST-NEXT: movl %eax, %ebx ; X86-FAST-NEXT: jmp .LBB6_6 @@ -301,12 +301,12 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-NEXT: shldl %cl, %ebp, %eax ; X86-FAST-NEXT: movl %edi, %ebp ; X86-FAST-NEXT: shldl %cl, %ebx, %ebp -; 
X86-FAST-NEXT: movl %edx, %ebx
+; X86-FAST-NEXT: movl %esi, %ebx
 ; X86-FAST-NEXT: shldl %cl, %edi, %ebx
 ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-FAST-NEXT: shldl %cl, %edx, %esi
+; X86-FAST-NEXT: shldl %cl, %esi, %edx
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT: movl %esi, 12(%ecx)
+; X86-FAST-NEXT: movl %edx, 12(%ecx)
 ; X86-FAST-NEXT: movl %ebx, 8(%ecx)
 ; X86-FAST-NEXT: movl %ebp, 4(%ecx)
 ; X86-FAST-NEXT: movl %eax, (%ecx)
@@ -324,26 +324,26 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-NEXT: pushl %edi
 ; X86-SLOW-NEXT: pushl %esi
 ; X86-SLOW-NEXT: pushl %eax
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT: testb $64, %al
 ; X86-SLOW-NEXT: jne .LBB6_1
 ; X86-SLOW-NEXT: # %bb.2:
 ; X86-SLOW-NEXT: movl %edx, %ebp
-; X86-SLOW-NEXT: movl %edi, %edx
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: movl %esi, %ebx
+; X86-SLOW-NEXT: movl %ebx, %edx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-SLOW-NEXT: movl %edi, %ecx
+; X86-SLOW-NEXT: movl %esi, %edi
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-SLOW-NEXT: testb $32, %al
 ; X86-SLOW-NEXT: je .LBB6_5
 ; X86-SLOW-NEXT: .LBB6_4:
-; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movl %ebx, %edi
-; X86-SLOW-NEXT: movl %edx, %ebx
+; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl %edi, %ebx
+; X86-SLOW-NEXT: movl %edx, %edi
 ; X86-SLOW-NEXT: movl %ecx, %edx
 ; X86-SLOW-NEXT: jmp .LBB6_6
 ; X86-SLOW-NEXT: .LBB6_1:
@@ -364,30 +364,30 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-NEXT: movb %ch, %cl
 ; X86-SLOW-NEXT: shrl %cl, %ebp
 ; X86-SLOW-NEXT: orl %esi, %ebp
-; X86-SLOW-NEXT: movl %ebx, %esi
+; X86-SLOW-NEXT: movl %edi, %esi
 ; X86-SLOW-NEXT: movb %al, %cl
 ; X86-SLOW-NEXT: shll %cl, %esi
 ; X86-SLOW-NEXT: shrl %edx
 ; X86-SLOW-NEXT: movb %ch, %cl
 ; X86-SLOW-NEXT: shrl %cl, %edx
 ; X86-SLOW-NEXT: orl %esi, %edx
-; X86-SLOW-NEXT: movl %edi, %esi
+; X86-SLOW-NEXT: movl %ebx, %esi
 ; X86-SLOW-NEXT: movb %al, %cl
 ; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: shrl %ebx
+; X86-SLOW-NEXT: shrl %edi
 ; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: shrl %cl, %ebx
-; X86-SLOW-NEXT: orl %esi, %ebx
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: orl %esi, %edi
 ; X86-SLOW-NEXT: movb %al, %cl
 ; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
 ; X86-SLOW-NEXT: shll %cl, %eax
-; X86-SLOW-NEXT: shrl %edi
+; X86-SLOW-NEXT: shrl %ebx
 ; X86-SLOW-NEXT: movb %ch, %cl
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: orl %eax, %edi
+; X86-SLOW-NEXT: shrl %cl, %ebx
+; X86-SLOW-NEXT: orl %eax, %ebx
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT: movl %edi, 12(%eax)
-; X86-SLOW-NEXT: movl %ebx, 8(%eax)
+; X86-SLOW-NEXT: movl %ebx, 12(%eax)
+; X86-SLOW-NEXT: movl %edi, 8(%eax)
 ; X86-SLOW-NEXT: movl %edx, 4(%eax)
 ; X86-SLOW-NEXT: movl %ebp, (%eax)
 ; X86-SLOW-NEXT: addl $4, %esp
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 367a3dddb8640..4340f8fd484ae 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -263,19 +263,19 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-FAST-NEXT: pushl %esi
 ; X86-FAST-NEXT: pushl %eax
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-FAST-NEXT: testb $64, %cl
 ; X86-FAST-NEXT: je .LBB6_1
 ; X86-FAST-NEXT: # %bb.2:
 ; X86-FAST-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-FAST-NEXT: movl %esi, %edx
-; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT: movl %edi, %ebp
-; X86-FAST-NEXT: movl %ebx, %edi
+; X86-FAST-NEXT: movl %edi, %edx
+; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-FAST-NEXT: movl %esi, %ebp
+; X86-FAST-NEXT: movl %ebx, %esi
 ; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-FAST-NEXT: testb $32, %cl
 ; X86-FAST-NEXT: je .LBB6_4
@@ -287,19 +287,19 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-FAST-NEXT: testb $32, %cl
 ; X86-FAST-NEXT: jne .LBB6_5
 ; X86-FAST-NEXT: .LBB6_4:
-; X86-FAST-NEXT: movl %esi, %ebx
-; X86-FAST-NEXT: movl %edi, %esi
-; X86-FAST-NEXT: movl %edx, %edi
+; X86-FAST-NEXT: movl %edi, %ebx
+; X86-FAST-NEXT: movl %esi, %edi
+; X86-FAST-NEXT: movl %edx, %esi
 ; X86-FAST-NEXT: movl %ebp, %edx
 ; X86-FAST-NEXT: movl (%esp), %ebp # 4-byte Reload
 ; X86-FAST-NEXT: .LBB6_5:
 ; X86-FAST-NEXT: shrdl %cl, %edx, %ebp
-; X86-FAST-NEXT: shrdl %cl, %edi, %edx
-; X86-FAST-NEXT: shrdl %cl, %esi, %edi
+; X86-FAST-NEXT: shrdl %cl, %esi, %edx
+; X86-FAST-NEXT: shrdl %cl, %edi, %esi
 ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-FAST-NEXT: shrdl %cl, %ebx, %esi
-; X86-FAST-NEXT: movl %esi, 12(%eax)
-; X86-FAST-NEXT: movl %edi, 8(%eax)
+; X86-FAST-NEXT: shrdl %cl, %ebx, %edi
+; X86-FAST-NEXT: movl %edi, 12(%eax)
+; X86-FAST-NEXT: movl %esi, 8(%eax)
 ; X86-FAST-NEXT: movl %edx, 4(%eax)
 ; X86-FAST-NEXT: movl %ebp, (%eax)
 ; X86-FAST-NEXT: addl $4, %esp
@@ -317,9 +317,9 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-NEXT: pushl %esi
 ; X86-SLOW-NEXT: subl $8, %esp
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT: testb $64, %cl
 ; X86-SLOW-NEXT: je .LBB6_1
@@ -327,15 +327,15 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-NEXT: movl %ebp, %eax
 ; X86-SLOW-NEXT: movl %ebx, %ebp
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT: movl %esi, %edx
-; X86-SLOW-NEXT: movl %edi, %esi
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT: movl %edi, %edx
+; X86-SLOW-NEXT: movl %esi, %edi
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-SLOW-NEXT: testb $32, %cl
 ; X86-SLOW-NEXT: jne .LBB6_5
 ; X86-SLOW-NEXT: .LBB6_4:
-; X86-SLOW-NEXT: movl %ebx, %edi
-; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movl %ebp, %esi
+; X86-SLOW-NEXT: movl %ebx, %esi
+; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl %ebp, %edi
 ; X86-SLOW-NEXT: movl %edx, %ebp
 ; X86-SLOW-NEXT: movl %eax, %edx
 ; X86-SLOW-NEXT: jmp .LBB6_6
@@ -357,28 +357,28 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X86-SLOW-NEXT: shrl %cl, %ebp
-; X86-SLOW-NEXT: leal (%esi,%esi), %edx
+; X86-SLOW-NEXT: leal (%edi,%edi), %edx
 ; X86-SLOW-NEXT: movl %ebx, %ecx
 ; X86-SLOW-NEXT: shll %cl, %edx
 ; X86-SLOW-NEXT: orl %ebp, %edx
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-SLOW-NEXT: leal (%esi,%esi), %ebp
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-SLOW-NEXT: leal (%edi,%edi), %ebp
 ; X86-SLOW-NEXT: movl %ebx, %ecx
 ; X86-SLOW-NEXT: shll %cl, %ebp
 ; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT: shrl %cl, %esi
-; X86-SLOW-NEXT: addl %edi, %edi
+; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: addl %esi, %esi
 ; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shll %cl, %edi
-; X86-SLOW-NEXT: orl %esi, %edi
+; X86-SLOW-NEXT: shll %cl, %esi
+; X86-SLOW-NEXT: orl %edi, %esi
 ; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT: movl %edi, 12(%ecx)
+; X86-SLOW-NEXT: movl %esi, 12(%ecx)
 ; X86-SLOW-NEXT: movl %ebp, 8(%ecx)
 ; X86-SLOW-NEXT: movl %edx, 4(%ecx)
 ; X86-SLOW-NEXT: movl %eax, (%ecx)
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index ec916148e8e20..4123890ed1a76 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -980,15 +980,15 @@ define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind {
 ; X86-SSE2-NEXT: pushl %esi
 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: leal (%eax,%eax,2), %edx
-; X86-SSE2-NEXT: movzwl 8(%ecx,%edx,4), %esi
-; X86-SSE2-NEXT: movl 4(%ecx,%edx,4), %edi
-; X86-SSE2-NEXT: shrdl $8, %esi, %edi
+; X86-SSE2-NEXT: leal (%eax,%eax,2), %esi
+; X86-SSE2-NEXT: movzwl 8(%ecx,%esi,4), %edx
+; X86-SSE2-NEXT: movl 4(%ecx,%esi,4), %edi
+; X86-SSE2-NEXT: shrdl $8, %edx, %edi
 ; X86-SSE2-NEXT: xorl %eax, %edi
 ; X86-SSE2-NEXT: sarl $31, %eax
-; X86-SSE2-NEXT: movzbl 10(%ecx,%edx,4), %ecx
+; X86-SSE2-NEXT: movzbl 10(%ecx,%esi,4), %ecx
 ; X86-SSE2-NEXT: shll $16, %ecx
-; X86-SSE2-NEXT: orl %esi, %ecx
+; X86-SSE2-NEXT: orl %edx, %ecx
 ; X86-SSE2-NEXT: shll $8, %ecx
 ; X86-SSE2-NEXT: movl %ecx, %edx
 ; X86-SSE2-NEXT: sarl $8, %edx
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll
index 1f64590ec03c8..bca446fa8fb56 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/CodeGen/X86/haddsub-2.ll
@@ -684,10 +684,10 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; SSE3-NEXT: pextrw $6, %xmm3, %r13d
 ; SSE3-NEXT: pextrw $7, %xmm3, %eax
 ; SSE3-NEXT: addl %r13d, %eax
-; SSE3-NEXT: movd %r12d, %xmm2
-; SSE3-NEXT: movd %r14d, %xmm3
+; SSE3-NEXT: movd %r12d, %xmm4
+; SSE3-NEXT: movd %r14d, %xmm2
 ; SSE3-NEXT: movd %ebx, %xmm5
-; SSE3-NEXT: movd %r10d, %xmm4
+; SSE3-NEXT: movd %r10d, %xmm3
 ; SSE3-NEXT: movd %r8d, %xmm6
 ; SSE3-NEXT: movd %esi, %xmm7
 ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload
@@ -702,13 +702,13 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; SSE3-NEXT: movd %r9d, %xmm14
 ; SSE3-NEXT: movd %edi, %xmm15
 ; SSE3-NEXT: movd %edx, %xmm1
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
@@ -1296,10 +1296,10 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; SSE3-NEXT: pextrw $6, %xmm3, %r13d
 ; SSE3-NEXT: pextrw $7, %xmm3, %eax
 ; SSE3-NEXT: addl %r13d, %eax
-; SSE3-NEXT: movd %r12d, %xmm2
-; SSE3-NEXT: movd %r15d, %xmm3
+; SSE3-NEXT: movd %r12d, %xmm4
+; SSE3-NEXT: movd %r15d, %xmm2
 ; SSE3-NEXT: movd %r14d, %xmm5
-; SSE3-NEXT: movd %ebp, %xmm4
+; SSE3-NEXT: movd %ebp, %xmm3
 ; SSE3-NEXT: movd %r10d, %xmm6
 ; SSE3-NEXT: movd %r9d, %xmm7
 ; SSE3-NEXT: movd %esi, %xmm8
@@ -1314,13 +1314,13 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero
 ; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
 ; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
 ; SSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 348c2d0d5966f..1a2aac657d30f 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -364,17 +364,17 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind {
 ; X86-BMI2: # %bb.0:
 ; X86-BMI2-NEXT: pushl %esi
 ; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT: movl $1, %eax
-; X86-BMI2-NEXT: xorl %edx, %edx
+; X86-BMI2-NEXT: movl $1, %edx
 ; X86-BMI2-NEXT: xorl %esi, %esi
-; X86-BMI2-NEXT: shldl %cl, %eax, %esi
-; X86-BMI2-NEXT: shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT: xorl %eax, %eax
+; X86-BMI2-NEXT: shldl %cl, %edx, %eax
+; X86-BMI2-NEXT: shlxl %ecx, %edx, %edx
 ; X86-BMI2-NEXT: testb $32, %cl
-; X86-BMI2-NEXT: cmovnel %eax, %esi
 ; X86-BMI2-NEXT: cmovnel %edx, %eax
-; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT: cmovnel %esi, %edx
 ; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT: orl %esi, %eax
+; X86-BMI2-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT: orl %eax, %edx
 ; X86-BMI2-NEXT: sete %al
 ; X86-BMI2-NEXT: popl %esi
 ; X86-BMI2-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index e1d2f9c343855..a026757a0264d 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -546,27 +546,26 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
 ;
 ; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
 ; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
+; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1]
-; SSSE3-FAST-NEXT: addps %xmm5, %xmm4
-; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0
-; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0
+; SSSE3-FAST-NEXT: addps %xmm5, %xmm0
+; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
-; SSSE3-FAST-NEXT: addps %xmm0, %xmm1
+; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
+; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSSE3-FAST-NEXT: addps %xmm1, %xmm2
 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-FAST-NEXT: addps %xmm1, %xmm3
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
-; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0
+; SSSE3-FAST-NEXT: addps %xmm2, %xmm3
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index fd806e56af080..9f58ed0843348 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -14,21 +14,21 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X86-NOBMI-NEXT: pushl %edi
 ; X86-NOBMI-NEXT: pushl %esi
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl %edi, %eax
 ; X86-NOBMI-NEXT: mull %esi
-; X86-NOBMI-NEXT: movl %edx, %ebx
-; X86-NOBMI-NEXT: movl %ebp, %eax
+; X86-NOBMI-NEXT: movl %edx, %ebp
+; X86-NOBMI-NEXT: movl %ebx, %eax
 ; X86-NOBMI-NEXT: mull %esi
 ; X86-NOBMI-NEXT: movl %edx, %esi
-; X86-NOBMI-NEXT: movl %eax, %ebp
-; X86-NOBMI-NEXT: addl %ebx, %ebp
+; X86-NOBMI-NEXT: movl %eax, %ebx
+; X86-NOBMI-NEXT: addl %ebp, %ebx
 ; X86-NOBMI-NEXT: adcl $0, %esi
 ; X86-NOBMI-NEXT: movl %edi, %eax
 ; X86-NOBMI-NEXT: mull %ecx
-; X86-NOBMI-NEXT: addl %ebp, %eax
+; X86-NOBMI-NEXT: addl %ebx, %eax
 ; X86-NOBMI-NEXT: adcl %edx, %esi
 ; X86-NOBMI-NEXT: setb %al
 ; X86-NOBMI-NEXT: movzbl %al, %edi
@@ -105,7 +105,7 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X86-NOBMI-NEXT: pushl %ebx
 ; X86-NOBMI-NEXT: pushl %edi
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: subl $24, %esp
+; X86-NOBMI-NEXT: subl $20, %esp
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: orl %ecx, %eax
@@ -121,44 +121,44 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %esi
+; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi
 ; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx
 ; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOBMI-NEXT: movl %esi, %eax
-; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT: mull %edi
-; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT: movl %edi, %eax
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: mull %esi
+; X86-NOBMI-NEXT: movl %edx, %ebp
 ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOBMI-NEXT: movl %ebx, %eax
-; X86-NOBMI-NEXT: mull %edi
-; X86-NOBMI-NEXT: movl %edx, %ebp
-; X86-NOBMI-NEXT: movl %eax, %ebx
-; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NOBMI-NEXT: adcl $0, %ebp
-; X86-NOBMI-NEXT: movl %esi, %eax
+; X86-NOBMI-NEXT: mull %esi
+; X86-NOBMI-NEXT: movl %edx, %ebx
+; X86-NOBMI-NEXT: movl %eax, %esi
+; X86-NOBMI-NEXT: addl %ebp, %esi
+; X86-NOBMI-NEXT: adcl $0, %ebx
+; X86-NOBMI-NEXT: movl %edi, %eax
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: mull %edx
-; X86-NOBMI-NEXT: movl %edx, %edi
-; X86-NOBMI-NEXT: movl %eax, %esi
-; X86-NOBMI-NEXT: addl %ebx, %esi
-; X86-NOBMI-NEXT: adcl %ebp, %edi
+; X86-NOBMI-NEXT: movl %edx, %ebp
+; X86-NOBMI-NEXT: movl %eax, %edi
+; X86-NOBMI-NEXT: addl %esi, %edi
+; X86-NOBMI-NEXT: adcl %ebx, %ebp
 ; X86-NOBMI-NEXT: setb %bl
 ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT: addl %edi, %eax
-; X86-NOBMI-NEXT: movzbl %bl, %edi
+; X86-NOBMI-NEXT: addl %ebp, %eax
+; X86-NOBMI-NEXT: movzbl %bl, %esi
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NOBMI-NEXT: adcl %edi, %edx
+; X86-NOBMI-NEXT: adcl %esi, %edx
 ; X86-NOBMI-NEXT: movl %ecx, %ebx
 ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NOBMI-NEXT: adcl $0, %eax
 ; X86-NOBMI-NEXT: adcl $0, %edx
-; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT: movl %ecx, (%edi,%ebx,8)
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8)
 ; X86-NOBMI-NEXT: movl %ebx, %ecx
-; X86-NOBMI-NEXT: movl %esi, 4(%edi,%ebx,8)
+; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8)
 ; X86-NOBMI-NEXT: addl $1, %ecx
 ; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload
 ; X86-NOBMI-NEXT: adcl $0, %edi
@@ -171,7 +171,7 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end
 ; X86-NOBMI-NEXT: xorl %eax, %eax
 ; X86-NOBMI-NEXT: xorl %edx, %edx
-; X86-NOBMI-NEXT: addl $24, %esp
+; X86-NOBMI-NEXT: addl $20, %esp
 ; X86-NOBMI-NEXT: popl %esi
 ; X86-NOBMI-NEXT: popl %edi
 ; X86-NOBMI-NEXT: popl %ebx
@@ -209,14 +209,14 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X86-BMI-NEXT: mulxl %eax, %edx, %edi
 ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-BMI-NEXT: movl %esi, %edx
-; X86-BMI-NEXT: mulxl %eax, %eax, %esi
-; X86-BMI-NEXT: addl %edi, %eax
-; X86-BMI-NEXT: adcl $0, %esi
+; X86-BMI-NEXT: mulxl %eax, %esi, %eax
+; X86-BMI-NEXT: addl %edi, %esi
+; X86-BMI-NEXT: adcl $0, %eax
 ; X86-BMI-NEXT: movl %ecx, %edx
 ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp
-; X86-BMI-NEXT: addl %eax, %edi
-; X86-BMI-NEXT: adcl %esi, %ebp
+; X86-BMI-NEXT: addl %esi, %edi
+; X86-BMI-NEXT: adcl %eax, %ebp
 ; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax
 ; X86-BMI-NEXT: setb %dl
diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
index 2c24db9afb54a..cf423227f23bc 100644
--- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
@@ -87,34 +87,35 @@ define <2 x i256> @test_srl(<2 x i256> %In) {
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %ebp, %ebx
-; X32-NEXT: shldl $28, %edx, %ebx
-; X32-NEXT: shldl $28, %esi, %edx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: shldl $28, %ecx, %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT: shldl $28, %edi, %ecx
+; X32-NEXT: movl %ebp, %esi
+; X32-NEXT: shldl $28, %edx, %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: shldl $28, %ebx, %edx
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: shldl $28, %ecx, %ebx
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shldl $28, %edi, %esi
 ; X32-NEXT: shldl $28, %eax, %edi
-; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl %eax, %edx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: shldl $28, %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: shrdl $4, %eax, %edx
+; X32-NEXT: shldl $28, %eax, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: shrdl $4, %eax, %ecx
 ; X32-NEXT: shrl $4, %ebp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl %ebp, 60(%eax)
-; X32-NEXT: movl %ebx, 56(%eax)
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, 52(%eax)
-; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, 56(%eax)
+; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, 52(%eax)
 ; X32-NEXT: movl %ebx, 48(%eax)
-; X32-NEXT: movl %ecx, 44(%eax)
+; X32-NEXT: movl %esi, 44(%eax)
 ; X32-NEXT: movl %edi, 40(%eax)
-; X32-NEXT: movl %esi, 36(%eax)
-; X32-NEXT: movl %edx, 32(%eax)
+; X32-NEXT: movl %edx, 36(%eax)
+; X32-NEXT: movl %ecx, 32(%eax)
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: shrl $31, %ecx
 ; X32-NEXT: movl %ecx, (%eax)
@@ -182,34 +183,35 @@ define <2 x i256> @test_sra(<2 x i256> %In) {
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %ebp, %ebx
-; X32-NEXT: shldl $26, %edx, %ebx
-; X32-NEXT: shldl $26, %esi, %edx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: shldl $26, %ecx, %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT: shldl $26, %edi, %ecx
+; X32-NEXT: movl %ebp, %esi
+; X32-NEXT: shldl $26, %edx, %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: shldl $26, %ebx, %edx
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: shldl $26, %ecx, %ebx
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shldl $26, %edi, %esi
 ; X32-NEXT: shldl $26, %eax, %edi
-; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl %eax, %edx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: shldl $26, %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: shrdl $6, %eax, %edx
+; X32-NEXT: shldl $26, %eax, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: shrdl $6, %eax, %ecx
 ; X32-NEXT: sarl $6, %ebp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl %ebp, 60(%eax)
-; X32-NEXT: movl %ebx, 56(%eax)
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, 52(%eax)
-; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, 56(%eax)
+; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, 52(%eax)
 ; X32-NEXT: movl %ebx, 48(%eax)
-; X32-NEXT: movl %ecx, 44(%eax)
+; X32-NEXT: movl %esi, 44(%eax)
 ; X32-NEXT: movl %edi, 40(%eax)
-; X32-NEXT: movl %esi, 36(%eax)
-; X32-NEXT: movl %edx, 32(%eax)
+; X32-NEXT: movl %edx, 36(%eax)
+; X32-NEXT: movl %ecx, 32(%eax)
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: sarl $31, %ecx
 ; X32-NEXT: movl %ecx, 28(%eax)
diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll
index bed0e3a24a017..f84960485840d 100644
--- a/llvm/test/CodeGen/X86/machine-cp.ll
+++ b/llvm/test/CodeGen/X86/machine-cp.ll
@@ -100,14 +100,14 @@ define <16 x float> @foo(<16 x float> %x) {
 ; CHECK-LABEL: foo:
 ; CHECK: ## %bb.0: ## %bb
 ; CHECK-NEXT: xorps %xmm5, %xmm5
-; CHECK-NEXT: cvttps2dq %xmm3, %xmm7
+; CHECK-NEXT: cvttps2dq %xmm3, %xmm8
 ; CHECK-NEXT: movaps %xmm3, %xmm4
 ; CHECK-NEXT: cmpltps %xmm5, %xmm4
-; CHECK-NEXT: movaps {{.*#+}} xmm8 = [13,14,15,16]
+; CHECK-NEXT: movaps {{.*#+}} xmm7 = [13,14,15,16]
 ; CHECK-NEXT: movaps %xmm4, %xmm6
-; CHECK-NEXT: orps %xmm8, %xmm6
-; CHECK-NEXT: cvtdq2ps %xmm7, %xmm3
-; CHECK-NEXT: andps %xmm8, %xmm3
+; CHECK-NEXT: orps %xmm7, %xmm6
+; CHECK-NEXT: cvtdq2ps %xmm8, %xmm3
+; CHECK-NEXT: andps %xmm7, %xmm3
 ; CHECK-NEXT: andps %xmm6, %xmm3
 ; CHECK-NEXT: andnps %xmm4, %xmm6
 ; CHECK-NEXT: cvttps2dq %xmm2, %xmm4
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index e07312c902d19..38abaf8ff11c6 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -183,36 +183,36 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: movdqa %xmm0, %xmm6
 ; SSE4-NEXT: pxor %xmm7, %xmm7
-; SSE4-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa {{.*#+}} xmm10 = [2147483647,2147483647]
+; SSE4-NEXT: movdqa %xmm10, %xmm0
 ; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: movdqa %xmm9, %xmm8
+; SSE4-NEXT: movdqa %xmm10, %xmm8
 ; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm10, %xmm0
 ; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm9, %xmm10
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm10
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm10, %xmm9
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm9
+; SSE4-NEXT: movdqa %xmm10, %xmm0
 ; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm9, %xmm3
+; SSE4-NEXT: movdqa %xmm10, %xmm3
 ; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm10, %xmm0
 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10
 ; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSE4-NEXT: movapd %xmm9, %xmm0
+; SSE4-NEXT: movapd %xmm10, %xmm0
 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
 ; SSE4-NEXT: movdqa %xmm1, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm6
 ; SSE4-NEXT: movapd %xmm3, %xmm0
 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
 ; SSE4-NEXT: movdqa %xmm1, %xmm2
 ; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
 ; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
-; SSE4-NEXT: movapd %xmm10, %xmm0
+; SSE4-NEXT: movapd %xmm9, %xmm0
 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
 ; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3
 ; SSE4-NEXT: movapd %xmm8, %xmm0
 ; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
 ; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index f2a3094b6b2d8..c8c5afbd579df 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -1637,14 +1637,13 @@ entry:
 define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwind {
 ; SSE-LABEL: test_mul8x8_f32:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: subq $88, %rsp
-; SSE-NEXT: movaps %xmm7, %xmm10
-; SSE-NEXT: movaps %xmm6, %xmm9
+; SSE-NEXT: subq $120, %rsp
 ; SSE-NEXT: movaps %xmm5, %xmm11
-; SSE-NEXT: movaps %xmm2, %xmm7
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm1, %xmm9
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
 ; SSE-NEXT: movaps %xmm14, %xmm15
@@ -1654,10 +1653,10 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: mulps %xmm0, %xmm15
 ; SSE-NEXT: movaps %xmm14, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1]
-; SSE-NEXT: movaps %xmm3, %xmm8
+; SSE-NEXT: movaps %xmm3, %xmm10
 ; SSE-NEXT: movaps %xmm3, %xmm12
-; SSE-NEXT: mulps %xmm0, %xmm8
-; SSE-NEXT: addps %xmm5, %xmm8
+; SSE-NEXT: mulps %xmm0, %xmm10
+; SSE-NEXT: addps %xmm5, %xmm10
 ; SSE-NEXT: mulps %xmm2, %xmm0
 ; SSE-NEXT: addps %xmm15, %xmm0
 ; SSE-NEXT: movaps %xmm14, %xmm1
@@ -1668,10 +1667,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: addps %xmm0, %xmm2
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT: mulps %xmm11, %xmm1
-; SSE-NEXT: addps %xmm8, %xmm1
+; SSE-NEXT: addps %xmm10, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3,3,3]
-; SSE-NEXT: movaps %xmm10, %xmm8
-; SSE-NEXT: movaps %xmm10, %xmm3
+; SSE-NEXT: movaps %xmm7, %xmm3
 ; SSE-NEXT: mulps %xmm14, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
 ; SSE-NEXT: mulps %xmm6, %xmm14
@@ -1685,7 +1683,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: addps %xmm3, %xmm1
 ; SSE-NEXT: movaps %xmm5, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1]
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: movaps %xmm8, %xmm3
 ; SSE-NEXT: mulps %xmm0, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0
@@ -1702,56 +1700,56 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: mulps %xmm5, %xmm0
 ; SSE-NEXT: addps %xmm1, %xmm0
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE-NEXT: mulps %xmm14, %xmm5
+; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT: addps %xmm2, %xmm5
 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: movaps %xmm10, %xmm2
+; SSE-NEXT: movaps %xmm9, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: movaps %xmm0, %xmm3
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
 ; SSE-NEXT: movaps %xmm12, %xmm4
 ; SSE-NEXT: mulps %xmm3, %xmm4
 ; SSE-NEXT: addps %xmm2, %xmm4
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: mulps %xmm6, %xmm1
-; SSE-NEXT: mulps %xmm7, %xmm3
-; SSE-NEXT: movaps %xmm7, %xmm13
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: mulps %xmm5, %xmm1
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: mulps %xmm13, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
-; SSE-NEXT: movaps %xmm15, %xmm5
 ; SSE-NEXT: movaps %xmm15, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm3, %xmm2
+; SSE-NEXT: movaps %xmm11, %xmm8
 ; SSE-NEXT: mulps %xmm11, %xmm1
 ; SSE-NEXT: addps %xmm4, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: movaps %xmm8, %xmm7
-; SSE-NEXT: movaps %xmm8, %xmm3
+; SSE-NEXT: movaps %xmm7, %xmm3
 ; SSE-NEXT: mulps %xmm0, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm9, %xmm0
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: mulps %xmm6, %xmm0
 ; SSE-NEXT: addps %xmm2, %xmm0
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT: movaps %xmm4, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0]
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE-NEXT: movaps %xmm14, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm0, %xmm2
 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: addps %xmm3, %xmm1
 ; SSE-NEXT: movaps %xmm4, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE-NEXT: movaps %xmm15, %xmm3
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: movaps %xmm11, %xmm3
 ; SSE-NEXT: mulps %xmm0, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
-; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: mulps %xmm1, %xmm0
 ; SSE-NEXT: addps %xmm2, %xmm0
 ; SSE-NEXT: movaps %xmm4, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2]
@@ -1765,63 +1763,60 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: mulps %xmm4, %xmm0
 ; SSE-NEXT: addps %xmm1, %xmm0
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: mulps %xmm14, %xmm4
+; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT: addps %xmm2, %xmm4
 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT: movaps %xmm10, %xmm2
+; SSE-NEXT: movaps %xmm9, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: movaps %xmm0, %xmm3
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
 ; SSE-NEXT: movaps %xmm12, %xmm4
 ; SSE-NEXT: mulps %xmm3, %xmm4
 ; SSE-NEXT: addps %xmm2, %xmm4
-; SSE-NEXT: mulps %xmm6, %xmm1
-; SSE-NEXT: movaps %xmm6, %xmm8
+; SSE-NEXT: mulps %xmm5, %xmm1
+; SSE-NEXT: movaps %xmm5, %xmm10
 ; SSE-NEXT: mulps %xmm13, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
-; SSE-NEXT: movaps %xmm5, %xmm2
-; SSE-NEXT: movaps %xmm5, %xmm14
+; SSE-NEXT: movaps %xmm15, %xmm2
+; SSE-NEXT: movaps %xmm15, %xmm5
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm11, %xmm6
-; SSE-NEXT: mulps %xmm11, %xmm1
+; SSE-NEXT: mulps %xmm8, %xmm1
 ; SSE-NEXT: addps %xmm4, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE-NEXT: movaps %xmm7, %xmm3
-; SSE-NEXT: movaps %xmm7, %xmm5
 ; SSE-NEXT: mulps %xmm0, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm9, %xmm0
-; SSE-NEXT: movaps %xmm9, %xmm11
+; SSE-NEXT: mulps %xmm6, %xmm0
 ; SSE-NEXT: addps %xmm2, %xmm0
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT: movaps %xmm4, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0]
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: movaps %xmm14, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm0, %xmm2
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: mulps %xmm7, %xmm1
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE-NEXT: mulps %xmm14, %xmm1
 ; SSE-NEXT: addps %xmm3, %xmm1
 ; SSE-NEXT: movaps %xmm4, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
-; SSE-NEXT: movaps %xmm15, %xmm3
+; SSE-NEXT: movaps %xmm11, %xmm3
 ; SSE-NEXT: mulps %xmm0, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: mulps %xmm1, %xmm0
+; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: addps %xmm2, %xmm0
 ; SSE-NEXT: movaps %xmm4, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2]
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm0, %xmm2
-; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: mulps %xmm11, %xmm1
 ; SSE-NEXT: addps %xmm3, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
@@ -1835,96 +1830,91 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT: movaps %xmm10, %xmm9
-; SSE-NEXT: movaps %xmm10, %xmm2
+; SSE-NEXT: movaps %xmm9, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: movaps %xmm0, %xmm3
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
 ; SSE-NEXT: movaps %xmm12, %xmm4
 ; SSE-NEXT: mulps %xmm3, %xmm4
 ; SSE-NEXT: addps %xmm2, %xmm4
-; SSE-NEXT: mulps %xmm8, %xmm1
+; SSE-NEXT: movaps %xmm10, %xmm15
+; SSE-NEXT: mulps %xmm10, %xmm1
 ; SSE-NEXT: mulps %xmm13, %xmm3
-; SSE-NEXT: movaps %xmm13, %xmm10
-; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: addps %xmm1, %xmm3
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
-; SSE-NEXT: movaps %xmm14, %xmm13
-; SSE-NEXT: movaps %xmm14, %xmm2
+; SSE-NEXT: movaps %xmm5, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm3, %xmm2
-; SSE-NEXT: mulps %xmm6, %xmm1
-; SSE-NEXT: movaps %xmm6, %xmm14
+; SSE-NEXT: mulps %xmm8, %xmm1
 ; SSE-NEXT: addps %xmm4, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: movaps %xmm5, %xmm4
-; SSE-NEXT: movaps %xmm5, %xmm3
+; SSE-NEXT: movaps %xmm7, %xmm4
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm7, %xmm3
 ; SSE-NEXT: mulps %xmm0, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm11, %xmm0
-; SSE-NEXT: movaps %xmm11, %xmm6
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: mulps %xmm6, %xmm0
 ; SSE-NEXT: addps %xmm2, %xmm0
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE-NEXT: movaps %xmm11, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm11[0,0]
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movaps %xmm10, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm10[0,0]
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm0, %xmm2
-; SSE-NEXT: mulps %xmm7, %xmm1
+; SSE-NEXT: mulps %xmm14, %xmm1
 ; SSE-NEXT: addps %xmm3, %xmm1
-; SSE-NEXT: movaps %xmm11, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm11[1,1]
-; SSE-NEXT: movaps %xmm15, %xmm3
+; SSE-NEXT: movaps %xmm10, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1]
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
 ; SSE-NEXT: mulps %xmm0, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: addps %xmm2, %xmm0
-; SSE-NEXT: movaps %xmm11, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[2,2]
+; SSE-NEXT: movaps %xmm10, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[2,2]
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm0, %xmm2
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulps %xmm0, %xmm1
+; SSE-NEXT: mulps %xmm11, %xmm1
 ; SSE-NEXT: addps %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3,3,3]
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulps %xmm11, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3,3,3]
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: movaps %xmm11, %xmm0
+; SSE-NEXT: mulps %xmm10, %xmm0
 ; SSE-NEXT: addps %xmm1, %xmm0
-; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm11
-; SSE-NEXT: addps %xmm2, %xmm11
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: addps %xmm2, %xmm10
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
 ; SSE-NEXT: movaps %xmm9, %xmm2
-; SSE-NEXT: movaps %xmm9, %xmm5
+; SSE-NEXT: movaps %xmm9, %xmm14
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: movaps %xmm0, %xmm3
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
 ; SSE-NEXT: movaps %xmm12, %xmm7
 ; SSE-NEXT: mulps %xmm3, %xmm7
 ; SSE-NEXT: addps %xmm2, %xmm7
-; SSE-NEXT: mulps %xmm8, %xmm1
-; SSE-NEXT: mulps %xmm10, %xmm3
+; SSE-NEXT: mulps %xmm15, %xmm1
+; SSE-NEXT: mulps %xmm13, %xmm3
 ; SSE-NEXT: addps %xmm1, %xmm3
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
-; SSE-NEXT: movaps %xmm13, %xmm2
-; SSE-NEXT: movaps %xmm13, %xmm10
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm5, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm14, %xmm9
-; SSE-NEXT: mulps %xmm14, %xmm1
+; SSE-NEXT: movaps %xmm8, %xmm9
+; SSE-NEXT: mulps %xmm8, %xmm1
 ; SSE-NEXT: addps %xmm7, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE-NEXT: movaps %xmm4, %xmm7
-; SSE-NEXT: movaps %xmm4, %xmm8
 ; SSE-NEXT: mulps %xmm0, %xmm7
 ; SSE-NEXT: addps %xmm1, %xmm7
 ; SSE-NEXT: movaps %xmm6, %xmm3
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: mulps %xmm6, %xmm0
 ; SSE-NEXT: addps %xmm2, %xmm0
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
@@ -1937,8 +1927,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: addps %xmm7, %xmm1
 ; SSE-NEXT: movaps %xmm4, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
-; SSE-NEXT: mulps %xmm0, %xmm15
-; SSE-NEXT: addps %xmm1, %xmm15
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: mulps %xmm0, %xmm7
+; SSE-NEXT: addps %xmm1, %xmm7
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: mulps %xmm1, %xmm0
 ; SSE-NEXT: addps %xmm2, %xmm0
@@ -1948,19 +1939,20 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm0, %xmm2
 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: addps %xmm15, %xmm1
+; SSE-NEXT: addps %xmm7, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movaps %xmm11, %xmm0
 ; SSE-NEXT: mulps %xmm4, %xmm0
 ; SSE-NEXT: addps %xmm1, %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT: addps %xmm2, %xmm4
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT: movaps %xmm5, %xmm7
-; SSE-NEXT: movaps %xmm5, %xmm2
+; SSE-NEXT: movaps %xmm14, %xmm6
+; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm14, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: movaps %xmm0, %xmm14
 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1]
@@ -1969,24 +1961,26 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: mulps %xmm14, %xmm15
 ; SSE-NEXT: addps %xmm2, %xmm15
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: mulps %xmm5, %xmm1
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: mulps %xmm6, %xmm14
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: mulps %xmm8, %xmm1
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: mulps %xmm7, %xmm14
 ; SSE-NEXT: addps %xmm1, %xmm14
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
-; SSE-NEXT: movaps %xmm10, %xmm2
+; SSE-NEXT: movaps %xmm5, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm14, %xmm2
 ; SSE-NEXT: mulps %xmm9, %xmm1
-; SSE-NEXT: movaps %xmm9, %xmm12
+; SSE-NEXT: movaps %xmm9, %xmm11
 ; SSE-NEXT: addps %xmm15, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: movaps %xmm8, %xmm14
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: movaps %xmm5, %xmm14
 ; SSE-NEXT: mulps %xmm0, %xmm14
 ; SSE-NEXT: addps %xmm1, %xmm14
 ; SSE-NEXT: mulps %xmm3, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm12
 ; SSE-NEXT: addps %xmm2, %xmm0
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
 ; SSE-NEXT: movaps %xmm3, %xmm1
@@ -1994,7 +1988,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
 ; SSE-NEXT: mulps %xmm1, %xmm15
 ; SSE-NEXT: addps %xmm0, %xmm15
-; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: mulps %xmm0, %xmm1
 ; SSE-NEXT: addps %xmm14, %xmm1
 ; SSE-NEXT: movaps %xmm3, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1]
@@ -2019,28 +2014,28 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT: mulps %xmm1, %xmm7
+; SSE-NEXT: mulps %xmm1, %xmm6
 ; SSE-NEXT: movaps %xmm0, %xmm15
 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[1,1]
 ; SSE-NEXT: mulps %xmm15, %xmm13
-; SSE-NEXT: addps %xmm7, %xmm13
-; SSE-NEXT: mulps %xmm5, %xmm1
-; SSE-NEXT: mulps %xmm6, %xmm15
-; SSE-NEXT: movaps %xmm6, %xmm7
+; SSE-NEXT: addps %xmm6, %xmm13
+; SSE-NEXT: mulps %xmm8, %xmm1
+; SSE-NEXT: mulps %xmm7, %xmm15
 ; SSE-NEXT: addps %xmm1, %xmm15
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
-; SSE-NEXT: movaps %xmm10, %xmm2
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movaps %xmm6, %xmm2
 ; SSE-NEXT: mulps %xmm1, %xmm2
 ; SSE-NEXT: addps %xmm15, %xmm2
 ; SSE-NEXT: mulps %xmm9, %xmm1
 ; SSE-NEXT: addps %xmm13, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: movaps %xmm8, %xmm9
+; SSE-NEXT: movaps %xmm5, %xmm9
 ; SSE-NEXT: mulps %xmm0, %xmm9
 ; SSE-NEXT: addps %xmm1, %xmm9
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: mulps %xmm6, %xmm0
+; SSE-NEXT: mulps %xmm12, %xmm0
+; SSE-NEXT: movaps %xmm12, %xmm5
 ; SSE-NEXT: addps %xmm2, %xmm0
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: movaps %xmm1, %xmm2
@@ -2048,7 +2043,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
 ; SSE-NEXT: mulps %xmm2, %xmm15
 ; SSE-NEXT: addps %xmm0, %xmm15
-; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: mulps %xmm0, %xmm2
 ; SSE-NEXT: addps %xmm9, %xmm2
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
@@ -2075,25 +2071,27 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; SSE-NEXT: mulps %xmm2, %xmm13
-; SSE-NEXT: mulps %xmm5, %xmm2
+; SSE-NEXT: mulps %xmm8, %xmm2
 ; SSE-NEXT: movaps %xmm0, %xmm9
 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: mulps %xmm9, %xmm5
-; SSE-NEXT: addps %xmm13, %xmm5
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: mulps %xmm9, %xmm8
+; SSE-NEXT: addps %xmm13, %xmm8
 ; SSE-NEXT: mulps %xmm7, %xmm9
 ; SSE-NEXT: addps %xmm2, %xmm9
 ; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm0[2,2]
-; SSE-NEXT: mulps %xmm2, %xmm10
-; SSE-NEXT: addps %xmm9, %xmm10
-; SSE-NEXT: mulps %xmm12, %xmm2
-; SSE-NEXT: addps %xmm5, %xmm2
+; SSE-NEXT: mulps %xmm2, %xmm6
+; SSE-NEXT: addps %xmm9, %xmm6
+; SSE-NEXT: mulps %xmm11, %xmm2
+; SSE-NEXT: addps %xmm8, %xmm2
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE-NEXT: mulps %xmm0, %xmm8
-; SSE-NEXT: addps %xmm2, %xmm8
-; SSE-NEXT: mulps %xmm6, %xmm0
-; SSE-NEXT: addps %xmm10, %xmm0
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: mulps %xmm0, %xmm9
+; SSE-NEXT: addps %xmm2, %xmm9
+; SSE-NEXT: movaps %xmm9, %xmm12
+; SSE-NEXT: mulps %xmm5, %xmm0
+; SSE-NEXT: addps %xmm6, %xmm0
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
 ; SSE-NEXT: movaps %xmm9, %xmm2
 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm9[0,0]
@@ -2101,7 +2099,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: mulps %xmm2, %xmm13
 ; SSE-NEXT: addps %xmm0, %xmm13
 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: addps %xmm8, %xmm2
+; SSE-NEXT: addps %xmm12, %xmm2
 ; SSE-NEXT: movaps %xmm9, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1]
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
@@ -2128,12 +2126,12 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: movaps %xmm1, 192(%rdi)
 ; SSE-NEXT: movaps %xmm14, 176(%rdi)
 ; SSE-NEXT: movaps %xmm3, 160(%rdi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 144(%rdi)
 ; SSE-NEXT: movaps %xmm4, 128(%rdi)
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 112(%rdi)
-; SSE-NEXT: movaps %xmm11, 96(%rdi)
+; SSE-NEXT: movaps %xmm10, 96(%rdi)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 80(%rdi)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -2146,7 +2144,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: movaps %xmm0, 16(%rdi)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, (%rdi)
-; SSE-NEXT: addq $88, %rsp
+; SSE-NEXT: addq $120, %rsp
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: test_mul8x8_f32:
@@ -3287,48 +3285,48 @@ entry:
 define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) nounwind {
 ; SSE-LABEL: test_mul8x8_f64:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: subq $344, %rsp # imm = 0x158
-; SSE-NEXT: movapd %xmm7, %xmm14
-; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: subq $328, %rsp # imm = 0x148
+; SSE-NEXT: movapd %xmm7, %xmm15
 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movapd %xmm4, %xmm15
-; SSE-NEXT: movapd %xmm3, %xmm10
+; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movq %rdi, %rax
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12
-; SSE-NEXT: movapd %xmm12, %xmm3
-; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm12[0]
-; SSE-NEXT: mulpd %xmm3, %xmm10
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13
+; SSE-NEXT: movapd %xmm13, %xmm12
+; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm13[0]
+; SSE-NEXT: movapd %xmm3, %xmm10
+; SSE-NEXT: mulpd %xmm12, %xmm10
 ; SSE-NEXT: movapd %xmm2, %xmm8
-; SSE-NEXT: mulpd %xmm3, %xmm8
+; SSE-NEXT: mulpd %xmm12, %xmm8
 ; SSE-NEXT: movapd %xmm1, %xmm9
-; SSE-NEXT: mulpd %xmm3, %xmm9
-; SSE-NEXT: mulpd %xmm0, %xmm3
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1,1]
+; SSE-NEXT: mulpd %xmm12, %xmm9
+; SSE-NEXT: mulpd %xmm0, %xmm12
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1,1]
 ; SSE-NEXT: movapd %xmm7, %xmm2
-; SSE-NEXT: mulpd %xmm12, %xmm2
+; SSE-NEXT: mulpd %xmm13, %xmm2
 ; SSE-NEXT: addpd %xmm10, %xmm2
 ; SSE-NEXT: movapd %xmm6, %xmm7
-; SSE-NEXT: mulpd %xmm12, %xmm7
+; SSE-NEXT: movapd %xmm6, %xmm10
+; SSE-NEXT: mulpd %xmm13, %xmm7
 ; SSE-NEXT: addpd %xmm8, %xmm7
 ; SSE-NEXT: movapd %xmm5, %xmm8
-; SSE-NEXT: mulpd %xmm12, %xmm8
+; SSE-NEXT: mulpd %xmm13, %xmm8
 ; SSE-NEXT: addpd %xmm9, %xmm8
-; SSE-NEXT: mulpd %xmm4, %xmm12
-; SSE-NEXT: addpd %xmm3, %xmm12
+; SSE-NEXT: mulpd %xmm4, %xmm13
+; SSE-NEXT: addpd %xmm12, %xmm13
 ; SSE-NEXT: movapd %xmm11, %xmm6
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0]
-; SSE-NEXT: movapd %xmm13, %xmm3
-; SSE-NEXT: mulpd %xmm6, %xmm3
-; SSE-NEXT: addpd %xmm12, %xmm3
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: movapd %xmm14, %xmm1
 ; SSE-NEXT: mulpd %xmm6, %xmm1
-; SSE-NEXT: addpd %xmm8, %xmm1
+; SSE-NEXT: addpd %xmm13, %xmm1
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: mulpd %xmm6, %xmm3
+; SSE-NEXT: addpd %xmm8, %xmm3
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT: mulpd %xmm6, %xmm5
 ; SSE-NEXT: addpd %xmm7, %xmm5
@@ -3343,9 +3341,9 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT: addpd %xmm5, %xmm4
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT: mulpd %xmm11, %xmm5
-; SSE-NEXT: addpd %xmm1, %xmm5
+; SSE-NEXT: addpd %xmm3, %xmm5
 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm11
-; SSE-NEXT: addpd %xmm3, %xmm11
+; SSE-NEXT: addpd %xmm1, %xmm11
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm6
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0]
@@ -3405,31 +3403,34 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm0
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: movapd %xmm10, %xmm3
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: movapd %xmm11, %xmm3
 ; SSE-NEXT: mulpd %xmm0, %xmm3
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: movapd %xmm14, %xmm13
-; SSE-NEXT: movapd %xmm14, %xmm2
+; SSE-NEXT: movapd %xmm15, %xmm8
+; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movapd %xmm15, %xmm2
 ; SSE-NEXT: mulpd %xmm1, %xmm2
 ; SSE-NEXT: addpd %xmm3, %xmm2
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: movapd %xmm12, %xmm3
-; SSE-NEXT: mulpd %xmm0, %xmm3
 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: movapd %xmm9, %xmm4
+; SSE-NEXT: movapd %xmm9, %xmm3
+; SSE-NEXT: mulpd %xmm0, %xmm3
+; SSE-NEXT: movapd %xmm10, %xmm15
+; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movapd %xmm10, %xmm4
 ; SSE-NEXT: mulpd %xmm1, %xmm4
 ; SSE-NEXT: addpd %xmm3, %xmm4
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: movapd %xmm14, %xmm3
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: movapd %xmm13, %xmm3
 ; SSE-NEXT: mulpd %xmm0, %xmm3
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: movapd %xmm11, %xmm5
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: movapd %xmm10, %xmm5
 ; SSE-NEXT: mulpd %xmm1, %xmm5
 ; SSE-NEXT: addpd %xmm3, %xmm5
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; SSE-NEXT: mulpd %xmm8, %xmm0
-; SSE-NEXT: mulpd %xmm15, %xmm1
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: mulpd %xmm12, %xmm0
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: mulpd %xmm14, %xmm1
 ; SSE-NEXT: addpd %xmm0, %xmm1
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: movapd %xmm0, %xmm6
@@ -3455,7 +3456,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT: mulpd %xmm0, %xmm5
 ; SSE-NEXT: addpd %xmm1, %xmm5
-; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: mulpd %xmm1, %xmm0
 ; SSE-NEXT: addpd %xmm3, %xmm0
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm6
@@ -3481,7 +3483,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT: mulpd %xmm1, %xmm5
 ; SSE-NEXT: addpd %xmm7, %xmm5
-; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: mulpd %xmm2, %xmm1
 ; SSE-NEXT: addpd %xmm3, %xmm1
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6
 ; SSE-NEXT: movapd %xmm6, %xmm3
@@ -3516,26 +3519,28 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm0
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: movapd %xmm10, %xmm3
+; SSE-NEXT: movapd %xmm11, %xmm3
 ; SSE-NEXT: mulpd %xmm0, %xmm3
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: movapd %xmm13, %xmm2
+; SSE-NEXT: movapd %xmm8, %xmm2
 ; SSE-NEXT: mulpd %xmm1, %xmm2
 ; SSE-NEXT: addpd %xmm3, %xmm2
-; SSE-NEXT: movapd %xmm12, %xmm3
+; SSE-NEXT: movapd %xmm9, %xmm3
 ; SSE-NEXT: mulpd %xmm0, %xmm3
-; SSE-NEXT: movapd %xmm9, %xmm4
+; SSE-NEXT: movapd %xmm15, %xmm4
 ; SSE-NEXT: mulpd %xmm1, %xmm4
 ; SSE-NEXT: addpd %xmm3, %xmm4
-; SSE-NEXT: movapd %xmm14, %xmm3
+; SSE-NEXT: movapd %xmm13, %xmm8
+; SSE-NEXT: movapd %xmm13, %xmm3
 ; SSE-NEXT: mulpd %xmm0, %xmm3
-; SSE-NEXT: movapd %xmm11, %xmm5
-; SSE-NEXT: movapd %xmm11, %xmm12
+; SSE-NEXT: movapd %xmm10, %xmm5
+; SSE-NEXT: movapd %xmm10, %xmm15
 ; SSE-NEXT: mulpd %xmm1, %xmm5
 ; SSE-NEXT: addpd %xmm3, %xmm5
-; SSE-NEXT: mulpd %xmm8, %xmm0
-; SSE-NEXT: movapd %xmm8, %xmm11
-; SSE-NEXT: mulpd %xmm15, %xmm1
+; SSE-NEXT: movapd %xmm12, %xmm10
+; SSE-NEXT: mulpd %xmm12, %xmm0
+; SSE-NEXT: movapd %xmm14, %xmm9
+; SSE-NEXT: mulpd %xmm14, %xmm1
 ; SSE-NEXT: addpd %xmm0, %xmm1
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: movapd %xmm0, %xmm6
@@ -3562,8 +3567,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT: mulpd %xmm0, %xmm5
 ; SSE-NEXT: addpd %xmm1, %xmm5
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: mulpd %xmm1, %xmm0
+; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: addpd %xmm3, %xmm0
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm6
@@ -3577,8 +3581,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT: mulpd %xmm6, %xmm5
 ; SSE-NEXT: addpd %xmm4, %xmm5
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulpd %xmm0, %xmm6
+; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6
 ; SSE-NEXT: addpd %xmm2, %xmm6
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
@@ -3593,9 +3596,9 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
 ; SSE-NEXT: mulpd %xmm2, %xmm1
 ; SSE-NEXT: addpd %xmm3, %xmm1
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: movapd %xmm6, %xmm3
-; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0]
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movapd %xmm7, %xmm3
+; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0]
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
 ; SSE-NEXT: mulpd %xmm3, %xmm2
 ; SSE-NEXT: addpd %xmm1, %xmm2
@@ -3608,205 +3611,209 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT: mulpd %xmm4, %xmm3
 ; SSE-NEXT: addpd %xmm0, %xmm3
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulpd %xmm6, %xmm0
+; SSE-NEXT: mulpd %xmm7, %xmm0
 ; SSE-NEXT: addpd %xmm3, %xmm0
 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulpd %xmm6, %xmm0
+; SSE-NEXT: mulpd %xmm7, %xmm0
 ; SSE-NEXT: addpd %xmm5, %xmm0
 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulpd %xmm6, %xmm0
+; SSE-NEXT: mulpd %xmm7, %xmm0
 ; SSE-NEXT: addpd %xmm1, %xmm0
 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulpd %xmm0, %xmm6
-; SSE-NEXT: addpd %xmm2, %xmm6
-; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: mulpd %xmm0, %xmm7
+; SSE-NEXT: addpd %xmm2, %xmm7
+; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm0
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: movapd %xmm10, %xmm8
-; SSE-NEXT: movapd %xmm10, %xmm3
+; SSE-NEXT: movapd %xmm11, %xmm3
+; SSE-NEXT: movapd %xmm11, %xmm12
 ; SSE-NEXT: mulpd %xmm0, %xmm3
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: movapd %xmm13, %xmm2
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movapd %xmm6, %xmm2
 ; SSE-NEXT: mulpd %xmm1, %xmm2
 ; SSE-NEXT: addpd %xmm3, %xmm2
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: movapd %xmm9, %xmm3
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: movapd %xmm11, %xmm3
 ; SSE-NEXT: mulpd %xmm0, %xmm3
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: movapd %xmm10, %xmm4
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: movapd %xmm13, %xmm4
 ; SSE-NEXT: mulpd %xmm1, %xmm4
 ; SSE-NEXT: addpd %xmm3, %xmm4
-; SSE-NEXT: movapd %xmm14, %xmm3
+; SSE-NEXT: movapd %xmm8, %xmm3
+; SSE-NEXT: movapd %xmm8, %xmm14
 ; SSE-NEXT: mulpd %xmm0, %xmm3
-; SSE-NEXT: movapd %xmm12, %xmm14
-; SSE-NEXT: movapd %xmm12, %xmm5
+; SSE-NEXT: movapd %xmm15, %xmm8
+; SSE-NEXT: movapd %xmm15, %xmm5
 ; SSE-NEXT: mulpd %xmm1, %xmm5
 ; SSE-NEXT: addpd %xmm3, %xmm5
-; SSE-NEXT: mulpd %xmm11, %xmm0
-; SSE-NEXT: mulpd %xmm15, %xmm1
+; SSE-NEXT: mulpd %xmm10, %xmm0
+; SSE-NEXT: mulpd %xmm9, %xmm1
+; SSE-NEXT: movapd %xmm9, %xmm10
 ; SSE-NEXT: addpd %xmm0, %xmm1
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm6
-; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0]
+; SSE-NEXT: movapd %xmm0, %xmm7
+; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm0[0]
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: mulpd %xmm6, %xmm3
+; SSE-NEXT: mulpd %xmm7, %xmm3
 ; SSE-NEXT: addpd %xmm1, %xmm3
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: mulpd %xmm6, %xmm1
+; SSE-NEXT: mulpd %xmm7, %xmm1
 ; SSE-NEXT: addpd %xmm5, %xmm1
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: mulpd %xmm6, %xmm5
+; SSE-NEXT: mulpd %xmm7, %xmm5
 ; SSE-NEXT: addpd %xmm4, %xmm5
-; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: addpd %xmm2, %xmm6
+; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: addpd %xmm2, %xmm7
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
 ; SSE-NEXT: mulpd %xmm0, %xmm2
-; SSE-NEXT: addpd %xmm6, %xmm2
+; SSE-NEXT: addpd %xmm7, %xmm2
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT: mulpd %xmm0, %xmm4
 ; SSE-NEXT: addpd %xmm5, %xmm4
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT: mulpd %xmm0, %xmm5
 ; SSE-NEXT: addpd %xmm1, %xmm5
-; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: mulpd %xmm1, %xmm0
 ; SSE-NEXT: addpd %xmm3, %xmm0
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm6
-; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0]
+; SSE-NEXT: movapd %xmm1, %xmm7
+; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0]
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: mulpd %xmm6, %xmm3
+; SSE-NEXT: mulpd %xmm7, %xmm3
 ; SSE-NEXT: addpd %xmm0, %xmm3
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: mulpd %xmm6, %xmm7
-; SSE-NEXT: addpd %xmm5, %xmm7
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: mulpd %xmm7, %xmm9
+; SSE-NEXT: addpd %xmm5, %xmm9
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: mulpd %xmm6, %xmm5
+; SSE-NEXT: mulpd %xmm7, %xmm5
 ; SSE-NEXT: addpd %xmm4, %xmm5
-; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: addpd %xmm2, %xmm6
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: mulpd %xmm0, %xmm7
+; SSE-NEXT: addpd %xmm2, %xmm7
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT: mulpd %xmm1, %xmm0
-; SSE-NEXT: addpd %xmm6, %xmm0
+; SSE-NEXT: addpd %xmm7, %xmm0
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT: mulpd %xmm1, %xmm4
 ; SSE-NEXT: addpd %xmm5, %xmm4
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: mulpd %xmm1, %xmm6
-; SSE-NEXT: addpd %xmm7, %xmm6
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: mulpd %xmm1, %xmm7
+; SSE-NEXT: addpd %xmm9, %xmm7
 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: addpd %xmm3, %xmm1
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: movapd %xmm5, %xmm3
-; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15
+; SSE-NEXT: movapd %xmm15, %xmm3
+; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm15[0]
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
 ; SSE-NEXT: mulpd %xmm3, %xmm2
 ; SSE-NEXT: addpd %xmm1, %xmm2
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: mulpd %xmm3, %xmm1
-; SSE-NEXT: addpd %xmm6, %xmm1
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: mulpd %xmm3, %xmm6
-; SSE-NEXT: addpd %xmm4, %xmm6
+; SSE-NEXT: addpd %xmm7, %xmm1
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: mulpd %xmm3, %xmm7
+; SSE-NEXT: addpd %xmm4, %xmm7
 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3
 ; SSE-NEXT: addpd %xmm0, %xmm3
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1,1]
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulpd %xmm5, %xmm0
+; SSE-NEXT: mulpd %xmm15, %xmm0
 ; SSE-NEXT: addpd %xmm3, %xmm0
 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulpd %xmm5, %xmm0
-; SSE-NEXT: addpd %xmm6, %xmm0
+; SSE-NEXT: mulpd %xmm15, %xmm0
+; SSE-NEXT: addpd %xmm7, %xmm0
 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: mulpd %xmm5, %xmm0
+; SSE-NEXT: mulpd %xmm15, %xmm0
 ; SSE-NEXT: addpd %xmm1, %xmm0
 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: addpd %xmm2, %xmm5
-; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm15
+; SSE-NEXT: addpd %xmm2, %xmm15
 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm0
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: mulpd %xmm0, %xmm8
+; SSE-NEXT: movapd %xmm12, %xmm3
+; SSE-NEXT: mulpd %xmm0, %xmm3
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: movapd %xmm13, %xmm2
-; SSE-NEXT: movapd %xmm13, %xmm12
+; SSE-NEXT: movapd %xmm6, %xmm2
+; SSE-NEXT: movapd %xmm6, %xmm12
 ; SSE-NEXT: mulpd %xmm1, %xmm2
-; SSE-NEXT: addpd %xmm8, %xmm2
-; SSE-NEXT: mulpd %xmm0, %xmm9
-; SSE-NEXT: movapd %xmm10, %xmm4
+; SSE-NEXT: addpd %xmm3, %xmm2
+; SSE-NEXT: mulpd %xmm0, %xmm11
+; SSE-NEXT: movapd %xmm13, %xmm6
+; SSE-NEXT: movapd %xmm13, %xmm4
 ; SSE-NEXT: mulpd %xmm1, %xmm4
-; SSE-NEXT: addpd %xmm9, %xmm4
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movapd %xmm5, %xmm3
-; SSE-NEXT: mulpd %xmm0, %xmm3
-; SSE-NEXT: movapd %xmm14, %xmm6
-; SSE-NEXT: mulpd %xmm1, %xmm6
-; SSE-NEXT: addpd %xmm3, %xmm6
-; SSE-NEXT: movapd %xmm11, %xmm8
-; SSE-NEXT: mulpd %xmm11, %xmm0
-; SSE-NEXT: movapd %xmm15, %xmm10
-; SSE-NEXT: mulpd %xmm15, %xmm1
+; SSE-NEXT: addpd %xmm11, %xmm4
+; SSE-NEXT: mulpd %xmm0, %xmm14
+; SSE-NEXT: movapd %xmm8, %xmm7
+; SSE-NEXT: mulpd %xmm1, %xmm7
+; SSE-NEXT: addpd %xmm14, %xmm7
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: mulpd %xmm8, %xmm0
+; SSE-NEXT: movapd %xmm10, %xmm5 +; SSE-NEXT: mulpd %xmm10, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm0[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm7, %xmm3 +; SSE-NEXT: mulpd %xmm9, %xmm3 ; SSE-NEXT: addpd %xmm1, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm7, %xmm1 -; SSE-NEXT: addpd %xmm6, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm7, %xmm6 -; SSE-NEXT: addpd %xmm4, %xmm6 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: addpd %xmm2, %xmm7 +; SSE-NEXT: mulpd %xmm9, %xmm1 +; SSE-NEXT: addpd %xmm7, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm9, %xmm7 +; SSE-NEXT: addpd %xmm4, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: addpd %xmm2, %xmm9 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm0, %xmm2 -; SSE-NEXT: addpd %xmm7, %xmm2 +; SSE-NEXT: addpd %xmm9, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: addpd %xmm6, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm0, %xmm6 -; SSE-NEXT: addpd %xmm1, %xmm6 +; SSE-NEXT: addpd %xmm7, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm0, %xmm7 +; SSE-NEXT: addpd %xmm1, %xmm7 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movapd %xmm1, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm7, %xmm3 +; SSE-NEXT: mulpd %xmm9, %xmm3 ; SSE-NEXT: addpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm7, %xmm9 -; SSE-NEXT: addpd %xmm6, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm7, %xmm6 -; SSE-NEXT: addpd %xmm4, %xmm6 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: addpd %xmm2, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm9, %xmm10 +; SSE-NEXT: addpd %xmm7, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm9, %xmm7 +; SSE-NEXT: addpd %xmm4, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: addpd %xmm2, %xmm9 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm7, %xmm0 +; SSE-NEXT: addpd %xmm9, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm1, %xmm9 +; SSE-NEXT: addpd %xmm7, %xmm9 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: mulpd %xmm1, %xmm7 -; SSE-NEXT: addpd %xmm6, %xmm7 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm1, %xmm6 -; SSE-NEXT: addpd %xmm9, %xmm6 +; SSE-NEXT: addpd %xmm10, %xmm7 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 @@ -3817,12 +3824,11 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm1, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm6, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: mulpd %xmm3, %xmm6 -; SSE-NEXT: addpd %xmm7, %xmm6 -; 
SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm4, %xmm3 +; SSE-NEXT: addpd %xmm7, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm3, %xmm7 +; SSE-NEXT: addpd %xmm9, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: addpd %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 @@ -3831,7 +3837,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm11, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm11, %xmm0 @@ -3846,239 +3852,245 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm13, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm12, %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movapd %xmm14, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movapd %xmm15, %xmm6 -; SSE-NEXT: mulpd %xmm1, %xmm6 -; SSE-NEXT: addpd %xmm3, %xmm6 -; SSE-NEXT: mulpd %xmm0, %xmm5 -; SSE-NEXT: movapd %xmm14, %xmm4 -; SSE-NEXT: movapd %xmm14, %xmm7 +; SSE-NEXT: movapd %xmm6, %xmm7 ; SSE-NEXT: mulpd %xmm1, %xmm7 -; SSE-NEXT: addpd %xmm5, %xmm7 +; SSE-NEXT: addpd %xmm3, %xmm7 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movapd %xmm4, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movapd %xmm6, %xmm9 +; SSE-NEXT: mulpd %xmm1, %xmm9 +; SSE-NEXT: addpd %xmm3, %xmm9 ; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: mulpd %xmm10, %xmm1 -; SSE-NEXT: movapd %xmm10, %xmm14 +; SSE-NEXT: mulpd %xmm5, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm9 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movapd %xmm0, %xmm10 +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: mulpd %xmm9, %xmm3 +; SSE-NEXT: mulpd %xmm10, %xmm3 ; SSE-NEXT: addpd %xmm1, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm9, %xmm10 -; SSE-NEXT: addpd %xmm7, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm9, %xmm7 -; SSE-NEXT: addpd %xmm6, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: addpd %xmm2, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm10, %xmm12 +; SSE-NEXT: addpd %xmm9, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm10, %xmm9 +; SSE-NEXT: addpd %xmm7, %xmm9 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: addpd %xmm2, %xmm10 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: addpd %xmm9, %xmm1 +; SSE-NEXT: addpd %xmm10, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm0, %xmm10 +; SSE-NEXT: addpd %xmm9, %xmm10 ; SSE-NEXT: movapd 
{{[0-9]+}}(%rsp), %xmm9 ; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: addpd %xmm7, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm0, %xmm7 -; SSE-NEXT: addpd %xmm10, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm12, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: movapd %xmm6, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movapd %xmm7, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm3, %xmm2 ; SSE-NEXT: addpd %xmm0, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm3, %xmm10 -; SSE-NEXT: addpd %xmm7, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm3, %xmm7 -; SSE-NEXT: addpd %xmm9, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm3, %xmm12 +; SSE-NEXT: addpd %xmm9, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm3, %xmm9 +; SSE-NEXT: addpd %xmm10, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: addpd %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: mulpd %xmm7, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm7, %xmm10 +; SSE-NEXT: addpd %xmm9, %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm6, %xmm9 -; SSE-NEXT: addpd %xmm7, %xmm9 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: mulpd %xmm6, %xmm7 -; SSE-NEXT: addpd %xmm10, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: mulpd %xmm7, %xmm9 +; SSE-NEXT: addpd %xmm12, %xmm9 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: addpd %xmm2, %xmm7 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 ; SSE-NEXT: movapd %xmm8, %xmm2 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm8[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: addpd %xmm6, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm2, %xmm10 -; SSE-NEXT: addpd %xmm7, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm6 -; SSE-NEXT: mulpd %xmm2, %xmm6 -; SSE-NEXT: addpd %xmm9, %xmm6 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm7, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm2, %xmm12 +; SSE-NEXT: addpd %xmm9, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm2, %xmm7 +; SSE-NEXT: addpd %xmm10, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm3, %xmm2 ; SSE-NEXT: addpd %xmm0, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movapd %xmm7, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm8, %xmm0 ; SSE-NEXT: addpd %xmm2, %xmm0 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm0 ; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: mulpd %xmm8, 
%xmm0 -; SSE-NEXT: addpd %xmm10, %xmm0 -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm8, %xmm9 +; SSE-NEXT: addpd %xmm12, %xmm9 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm0, %xmm8 ; SSE-NEXT: addpd %xmm1, %xmm8 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: mulpd %xmm0, %xmm13 +; SSE-NEXT: movapd %xmm13, %xmm12 +; SSE-NEXT: mulpd %xmm0, %xmm12 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm12, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: mulpd %xmm1, %xmm3 -; SSE-NEXT: addpd %xmm13, %xmm3 +; SSE-NEXT: addpd %xmm12, %xmm3 +; SSE-NEXT: movapd %xmm14, %xmm12 +; SSE-NEXT: movapd %xmm14, %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm12 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movapd %xmm13, %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: movapd %xmm15, %xmm10 -; SSE-NEXT: mulpd %xmm1, %xmm10 -; SSE-NEXT: addpd %xmm9, %xmm10 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movapd %xmm12, %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm13 +; SSE-NEXT: addpd %xmm12, %xmm13 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: movapd %xmm6, %xmm14 +; SSE-NEXT: mulpd %xmm1, %xmm14 +; SSE-NEXT: addpd %xmm4, %xmm14 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: mulpd %xmm6, %xmm0 -; SSE-NEXT: mulpd %xmm14, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: mulpd %xmm10, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: addpd %xmm1, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm0, %xmm12 +; SSE-NEXT: addpd %xmm1, %xmm12 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm0, %xmm1 -; SSE-NEXT: addpd %xmm4, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulpd %xmm0, %xmm15 -; SSE-NEXT: addpd %xmm10, %xmm15 +; SSE-NEXT: addpd %xmm14, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm0, %xmm14 +; SSE-NEXT: addpd %xmm13, %xmm14 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm2, %xmm10 -; SSE-NEXT: addpd %xmm0, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm2, %xmm13 +; SSE-NEXT: addpd %xmm0, %xmm13 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm2, %xmm0 -; SSE-NEXT: addpd %xmm15, %xmm0 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulpd %xmm2, %xmm15 -; SSE-NEXT: addpd %xmm1, %xmm15 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: addpd %xmm9, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: movapd %xmm9, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: addpd %xmm14, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm2, %xmm14 +; SSE-NEXT: addpd %xmm1, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm12, %xmm2 +; SSE-NEXT: movapd 
{{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: movapd %xmm12, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: mulpd %xmm1, %xmm3 ; SSE-NEXT: addpd %xmm2, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm2 -; SSE-NEXT: addpd %xmm15, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulpd %xmm1, %xmm15 -; SSE-NEXT: addpd %xmm0, %xmm15 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: addpd %xmm10, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1,1] +; SSE-NEXT: addpd %xmm14, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm1, %xmm14 +; SSE-NEXT: addpd %xmm0, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm13, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: mulpd %xmm9, %xmm4 +; SSE-NEXT: mulpd %xmm12, %xmm4 ; SSE-NEXT: addpd %xmm1, %xmm4 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm9, %xmm10 -; SSE-NEXT: addpd %xmm15, %xmm10 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 -; SSE-NEXT: mulpd %xmm9, %xmm15 -; SSE-NEXT: addpd %xmm2, %xmm15 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: addpd %xmm3, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm12, %xmm13 +; SSE-NEXT: addpd %xmm14, %xmm13 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm12, %xmm14 +; SSE-NEXT: addpd %xmm2, %xmm14 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: addpd %xmm3, %xmm12 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm3 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm9, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm3, %xmm9 -; SSE-NEXT: addpd %xmm15, %xmm9 -; SSE-NEXT: movapd %xmm5, %xmm0 +; SSE-NEXT: addpd %xmm12, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulpd %xmm3, %xmm12 +; SSE-NEXT: addpd %xmm14, %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm3, %xmm0 -; SSE-NEXT: addpd %xmm10, %xmm0 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm13, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: addpd %xmm4, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: movapd %xmm7, %xmm15 -; SSE-NEXT: mulpd %xmm2, %xmm15 -; SSE-NEXT: addpd %xmm3, %xmm15 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 -; SSE-NEXT: mulpd %xmm2, %xmm10 -; SSE-NEXT: addpd %xmm0, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm2, %xmm14 +; SSE-NEXT: addpd %xmm3, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm2, %xmm13 +; SSE-NEXT: addpd %xmm0, %xmm13 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: mulpd %xmm2, %xmm7 -; SSE-NEXT: addpd %xmm9, %xmm7 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm12, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm2 ; SSE-NEXT: addpd %xmm1, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: mulpd %xmm0, %xmm9 -; SSE-NEXT: mulpd %xmm0, %xmm13 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: mulpd %xmm0, %xmm12 +; 
SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: mulpd %xmm6, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm9 +; SSE-NEXT: addpd %xmm12, %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm12 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm13, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm5 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm12, %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: mulpd %xmm14, %xmm1 +; SSE-NEXT: mulpd %xmm10, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm4 ; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 -; SSE-NEXT: mulpd %xmm4, %xmm14 -; SSE-NEXT: addpd %xmm1, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm4, %xmm10 +; SSE-NEXT: addpd %xmm1, %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm4, %xmm1 ; SSE-NEXT: addpd %xmm3, %xmm1 @@ -4088,12 +4100,12 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm5 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: addpd %xmm12, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm0, %xmm1 ; SSE-NEXT: addpd %xmm4, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm9 +; SSE-NEXT: movapd %xmm1, %xmm12 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: mulpd %xmm0, %xmm6 ; SSE-NEXT: addpd %xmm5, %xmm6 @@ -4102,7 +4114,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm3 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: addpd %xmm14, %xmm0 +; SSE-NEXT: addpd %xmm10, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm4 ; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0] @@ -4112,13 +4124,13 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm4, %xmm0 ; SSE-NEXT: addpd %xmm3, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm14 +; SSE-NEXT: movapd %xmm0, %xmm10 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm4, %xmm0 ; SSE-NEXT: addpd %xmm6, %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm6 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: addpd %xmm12, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 @@ -4130,8 +4142,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm0, %xmm6 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: addpd %xmm14, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: addpd %xmm10, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm10 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 @@ -4142,10 +4154,10 @@ define <64 x double> @test_mul8x8_f64(<64 x 
double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm1, %xmm5 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: mulpd %xmm4, %xmm1 -; SSE-NEXT: addpd %xmm9, %xmm1 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: mulpd %xmm4, %xmm9 -; SSE-NEXT: addpd %xmm6, %xmm9 +; SSE-NEXT: addpd %xmm10, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm4, %xmm10 +; SSE-NEXT: addpd %xmm6, %xmm10 ; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: addpd %xmm3, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] @@ -4154,7 +4166,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm4, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm0, %xmm4 -; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: addpd %xmm10, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 ; SSE-NEXT: mulpd %xmm0, %xmm6 ; SSE-NEXT: addpd %xmm1, %xmm6 @@ -4164,16 +4176,15 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm4, 480(%rdi) ; SSE-NEXT: movapd %xmm6, 464(%rdi) ; SSE-NEXT: movapd %xmm0, 448(%rdi) -; SSE-NEXT: movapd %xmm15, 432(%rdi) -; SSE-NEXT: movapd %xmm10, 416(%rdi) +; SSE-NEXT: movapd %xmm14, 432(%rdi) +; SSE-NEXT: movapd %xmm13, 416(%rdi) ; SSE-NEXT: movapd %xmm7, 400(%rdi) ; SSE-NEXT: movapd %xmm2, 384(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rdi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rdi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rdi) +; SSE-NEXT: movapd %xmm9, 336(%rdi) ; SSE-NEXT: movapd %xmm8, 320(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%rdi) @@ -4188,8 +4199,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movaps %xmm0, 224(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdi) +; SSE-NEXT: movapd %xmm15, 192(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4214,7 +4224,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movaps %xmm0, 16(%rdi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdi) -; SSE-NEXT: addq $344, %rsp # imm = 0x158 +; SSE-NEXT: addq $328, %rsp # imm = 0x148 ; SSE-NEXT: retq ; ; AVX1-LABEL: test_mul8x8_f64: @@ -4223,370 +4233,371 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $448, %rsp # imm = 0x1C0 -; AVX1-NEXT: vmovapd %ymm4, %ymm13 -; AVX1-NEXT: vmovapd %ymm3, %ymm9 -; AVX1-NEXT: vmovapd %ymm1, %ymm4 +; AVX1-NEXT: vmovapd %ymm2, %ymm12 ; AVX1-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill ; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: vmovapd 112(%rbp), %ymm12 -; AVX1-NEXT: vmovapd 48(%rbp), %ymm15 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm2 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm13 ; AVX1-NEXT: vbroadcastsd 272(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm8 -; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, %ymm9 +; AVX1-NEXT: vmulpd %ymm0, %ymm10, 
%ymm0 ; AVX1-NEXT: vbroadcastsd 280(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm11, %ymm8, %ymm3 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 -; AVX1-NEXT: vmovapd %ymm2, %ymm8 -; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm0 +; AVX1-NEXT: vaddpd %ymm11, %ymm8, %ymm1 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 288(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 296(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 304(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 312(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1-NEXT: vmovapd %ymm12, %ymm14 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmovapd %ymm13, %ymm14 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 320(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm2, %ymm13 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 328(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 336(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 ; AVX1-NEXT: vbroadcastsd 344(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 -; AVX1-NEXT: vmovapd %ymm9, %ymm12 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd (%rsp), %ymm2 # 32-byte Reload -; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm3, %ymm8 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload +; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 352(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd %ymm5, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 360(%rbp), %ymm10 ; 
AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 368(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 16(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd 16(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 376(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd 80(%rbp), %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 384(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 144(%rbp), %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 176(%rbp), %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 392(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 240(%rbp), %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 400(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm3 -; AVX1-NEXT: vmovapd %ymm4, %ymm9 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 ; AVX1-NEXT: vbroadcastsd 408(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1-NEXT: vmovapd %ymm8, %ymm5 +; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 416(%rbp), %ymm10 -; AVX1-NEXT: vmovapd %ymm13, %ymm4 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 -; AVX1-NEXT: vmovapd %ymm5, %ymm13 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd %ymm3, %ymm2 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 424(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm7, %ymm5 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 -; AVX1-NEXT: vmovapd %ymm6, %ymm7 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: 
vbroadcastsd 432(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 440(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 448(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 456(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 464(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm3 -; AVX1-NEXT: vmovapd %ymm9, %ymm6 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm1 +; AVX1-NEXT: vmovapd %ymm9, %ymm13 ; AVX1-NEXT: vbroadcastsd 472(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vmovapd %ymm2, %ymm1 -; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 -; AVX1-NEXT: vmovapd %ymm8, %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vmovapd %ymm15, %ymm9 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 480(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 -; AVX1-NEXT: vmovapd %ymm4, %ymm15 +; AVX1-NEXT: vmovapd %ymm4, %ymm3 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd %ymm13, %ymm9 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd %ymm2, %ymm15 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 488(%rbp), %ymm10 -; AVX1-NEXT: vmovapd %ymm5, %ymm8 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd %ymm7, %ymm8 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm6, %ymm7 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 496(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmovapd 48(%rbp), %ymm4 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 504(%rbp), %ymm10 ; AVX1-NEXT: vmovapd 112(%rbp), %ymm2 ; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; 
AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 512(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 144(%rbp), %ymm13 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 520(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm2 -; AVX1-NEXT: vmovapd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovapd 208(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 528(%rbp), %ymm0 -; AVX1-NEXT: vmovapd %ymm6, %ymm2 -; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm13, %ymm1 ; AVX1-NEXT: vbroadcastsd 536(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm5, %ymm6 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm10 ; AVX1-NEXT: vmovapd %ymm12, %ymm5 -; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovapd %ymm1, %ymm12 -; AVX1-NEXT: vmovapd %ymm14, %ymm6 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 544(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm3, %ymm12 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 552(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 560(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 16(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 ; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd %ymm4, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 568(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd 80(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 576(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 176(%rbp), %ymm13 
-; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 584(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 240(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovapd 208(%rbp), %ymm14 -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovapd 208(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 592(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm13, %ymm1 ; AVX1-NEXT: vbroadcastsd 600(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm0, %ymm12, %ymm0 -; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm0 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 608(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 616(%rbp), %ymm10 ; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 624(%rbp), %ymm10 ; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmovapd 48(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 632(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 112(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 ; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 ; AVX1-NEXT: vbroadcastsd 640(%rbp), %ymm10 -; AVX1-NEXT: vmovapd 144(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 ; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 -; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 -; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm1 ; AVX1-NEXT: vbroadcastsd 648(%rbp), %ymm10 -; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 -; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm14, %ymm4 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm1, %ymm11, %ymm1 ; AVX1-NEXT: 
vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vbroadcastsd 656(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm2, %ymm3 +; AVX1-NEXT: vmovapd %ymm13, %ymm3 +; AVX1-NEXT: vmulpd %ymm1, %ymm13, %ymm2 ; AVX1-NEXT: vbroadcastsd 664(%rbp), %ymm0 -; AVX1-NEXT: vmulpd %ymm0, %ymm5, %ymm13 -; AVX1-NEXT: vaddpd %ymm3, %ymm13, %ymm3 -; AVX1-NEXT: vmulpd %ymm1, %ymm12, %ymm1 -; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm14 +; AVX1-NEXT: vmovapd %ymm6, %ymm10 +; AVX1-NEXT: vaddpd %ymm2, %ymm14, %ymm2 +; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm5, %ymm0 +; AVX1-NEXT: vmovapd %ymm5, %ymm6 ; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vbroadcastsd 672(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm15, %ymm13 -; AVX1-NEXT: vaddpd %ymm0, %ymm13, %ymm0 -; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm3, %ymm1 -; AVX1-NEXT: vbroadcastsd 680(%rbp), %ymm3 -; AVX1-NEXT: vmulpd %ymm3, %ymm8, %ymm13 -; AVX1-NEXT: vaddpd %ymm1, %ymm13, %ymm1 -; AVX1-NEXT: vmulpd %ymm3, %ymm7, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 688(%rbp), %ymm3 -; AVX1-NEXT: vmovapd 16(%rbp), %ymm10 -; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm13 -; AVX1-NEXT: vaddpd %ymm0, %ymm13, %ymm0 -; AVX1-NEXT: vmulpd 48(%rbp), %ymm3, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd 696(%rbp), %ymm3 -; AVX1-NEXT: vmulpd 112(%rbp), %ymm3, %ymm13 -; AVX1-NEXT: vaddpd %ymm1, %ymm13, %ymm1 -; AVX1-NEXT: vmulpd 80(%rbp), %ymm3, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 704(%rbp), %ymm3 -; AVX1-NEXT: vmulpd 144(%rbp), %ymm3, %ymm13 -; AVX1-NEXT: vaddpd %ymm0, %ymm13, %ymm0 -; AVX1-NEXT: vmulpd 176(%rbp), %ymm3, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd 712(%rbp), %ymm13 -; AVX1-NEXT: vmovapd 240(%rbp), %ymm11 -; AVX1-NEXT: vmulpd %ymm13, %ymm11, %ymm3 -; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm3 -; AVX1-NEXT: vmulpd %ymm13, %ymm14, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vbroadcastsd 720(%rbp), %ymm1 -; AVX1-NEXT: vmulpd %ymm1, %ymm2, %ymm4 -; AVX1-NEXT: vmulpd %ymm1, %ymm12, %ymm1 -; AVX1-NEXT: vbroadcastsd 728(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm4, %ymm4 -; AVX1-NEXT: vmulpd %ymm2, %ymm6, %ymm2 +; AVX1-NEXT: vmulpd %ymm1, %ymm12, %ymm14 +; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vmulpd %ymm1, %ymm15, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vbroadcastsd 680(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm8, %ymm14 +; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 +; AVX1-NEXT: vmulpd %ymm2, %ymm7, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 688(%rbp), %ymm2 +; AVX1-NEXT: vmovapd 16(%rbp), %ymm11 +; AVX1-NEXT: vmulpd %ymm2, %ymm11, %ymm14 +; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 696(%rbp), %ymm2 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm14 +; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm5 +; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 704(%rbp), 
%ymm2 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm2, %ymm14 +; AVX1-NEXT: vaddpd %ymm0, %ymm14, %ymm0 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm13 +; AVX1-NEXT: vmulpd %ymm2, %ymm13, %ymm2 ; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vbroadcastsd 736(%rbp), %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm15, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 712(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm4, %ymm14 +; AVX1-NEXT: vaddpd %ymm1, %ymm14, %ymm1 +; AVX1-NEXT: vmovapd 208(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm2, %ymm14, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 720(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm3, %ymm3 ; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vbroadcastsd 728(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 +; AVX1-NEXT: vmulpd %ymm4, %ymm6, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd 736(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm12, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vmulpd %ymm4, %ymm15, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vbroadcastsd 744(%rbp), %ymm4 ; AVX1-NEXT: vmulpd %ymm4, %ymm8, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX1-NEXT: vmulpd %ymm4, %ymm7, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vbroadcastsd 752(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd %ymm4, %ymm11, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 ; AVX1-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vbroadcastsd 760(%rbp), %ymm4 ; AVX1-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX1-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vbroadcastsd 768(%rbp), %ymm4 ; AVX1-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 -; AVX1-NEXT: vaddpd %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vmulpd 176(%rbp), %ymm4, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vbroadcastsd 776(%rbp), %ymm4 -; AVX1-NEXT: vmulpd %ymm4, %ymm11, %ymm5 ; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vmulpd %ymm4, %ymm13, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastsd 776(%rbp), %ymm4 +; AVX1-NEXT: vmulpd 240(%rbp), %ymm4, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ; AVX1-NEXT: vmulpd %ymm4, %ymm14, %ymm4 -; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vmovapd %ymm2, 480(%rdi) -; AVX1-NEXT: vmovapd %ymm1, 448(%rdi) -; AVX1-NEXT: vmovapd %ymm3, 416(%rdi) +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vmovapd %ymm3, 480(%rdi) +; AVX1-NEXT: vmovapd %ymm2, 448(%rdi) +; AVX1-NEXT: vmovapd %ymm1, 416(%rdi) ; AVX1-NEXT: vmovapd %ymm0, 384(%rdi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 352(%rdi) @@ -4623,370 +4634,371 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; AVX2-NEXT: movq %rsp, %rbp ; AVX2-NEXT: andq $-32, %rsp ; AVX2-NEXT: subq $448, %rsp # imm = 0x1C0 -; AVX2-NEXT: vmovapd %ymm4, %ymm13 -; AVX2-NEXT: vmovapd %ymm3, %ymm9 -; AVX2-NEXT: vmovapd %ymm1, %ymm4 +; AVX2-NEXT: vmovapd %ymm2, %ymm12 ; AVX2-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill ; 
AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: vmovapd 112(%rbp), %ymm12
-; AVX2-NEXT: vmovapd 48(%rbp), %ymm15
+; AVX2-NEXT: vmovapd 144(%rbp), %ymm2
+; AVX2-NEXT: vmovapd 112(%rbp), %ymm13
 ; AVX2-NEXT: vbroadcastsd 272(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm8
-; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm1
+; AVX2-NEXT: vmovapd %ymm1, %ymm9
+; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 280(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm11, %ymm8, %ymm3
-; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10
-; AVX2-NEXT: vmovapd %ymm2, %ymm8
-; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm0
+; AVX2-NEXT: vaddpd %ymm11, %ymm8, %ymm1
+; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10
+; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 288(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11
+; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
 ; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 296(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 304(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 312(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11
-; AVX2-NEXT: vmovapd %ymm12, %ymm14
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
+; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11
+; AVX2-NEXT: vmovapd %ymm13, %ymm14
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 320(%rbp), %ymm10
-; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11
+; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11
+; AVX2-NEXT: vmovapd %ymm2, %ymm13
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
 ; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 328(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vbroadcastsd 336(%rbp), %ymm0
-; AVX2-NEXT: vmulpd %ymm0, %ymm4, %ymm3
+; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1
 ; AVX2-NEXT: vbroadcastsd 344(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11
-; AVX2-NEXT: vmovapd %ymm9, %ymm12
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmovapd (%rsp), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10
+; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11
+; AVX2-NEXT: vmovapd %ymm3, %ymm8
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmovapd (%rsp), %ymm15 # 32-byte Reload
+; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0
+; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 352(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11
+; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
 ; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmovapd %ymm5, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 360(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 368(%rbp), %ymm10
-; AVX2-NEXT: vmovapd 16(%rbp), %ymm1
-; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11
+; AVX2-NEXT: vmovapd 16(%rbp), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 376(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmovapd 80(%rbp), %ymm9
-; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmovapd 80(%rbp), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 384(%rbp), %ymm10
-; AVX2-NEXT: vmovapd 144(%rbp), %ymm9
-; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11
+; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmovapd 176(%rbp), %ymm9
-; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmovapd 176(%rbp), %ymm14
+; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 392(%rbp), %ymm10
-; AVX2-NEXT: vmovapd 240(%rbp), %ymm9
-; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmovapd 240(%rbp), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vbroadcastsd 400(%rbp), %ymm0
-; AVX2-NEXT: vmulpd %ymm0, %ymm4, %ymm3
-; AVX2-NEXT: vmovapd %ymm4, %ymm9
+; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1
 ; AVX2-NEXT: vbroadcastsd 408(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10
+; AVX2-NEXT: vmovapd %ymm8, %ymm5
+; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0
+; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 416(%rbp), %ymm10
-; AVX2-NEXT: vmovapd %ymm13, %ymm4
-; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11
+; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10
-; AVX2-NEXT: vmovapd %ymm5, %ymm13
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10
+; AVX2-NEXT: vmovapd %ymm3, %ymm2
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 424(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11
-; AVX2-NEXT: vmovapd %ymm7, %ymm5
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10
-; AVX2-NEXT: vmovapd %ymm6, %ymm7
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 432(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11
+; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmulpd 48(%rbp), %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 440(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
+; AVX2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 448(%rbp), %ymm10
-; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11
+; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 456(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vbroadcastsd 464(%rbp), %ymm0
-; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm3
-; AVX2-NEXT: vmovapd %ymm9, %ymm6
+; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm1
+; AVX2-NEXT: vmovapd %ymm9, %ymm13
 ; AVX2-NEXT: vbroadcastsd 472(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovapd %ymm2, %ymm1
-; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10
-; AVX2-NEXT: vmovapd %ymm8, %ymm14
+; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmulpd %ymm0, %ymm15, %ymm0
+; AVX2-NEXT: vmovapd %ymm15, %ymm9
+; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 480(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11
-; AVX2-NEXT: vmovapd %ymm4, %ymm15
+; AVX2-NEXT: vmovapd %ymm4, %ymm3
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmovapd %ymm13, %ymm9
-; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmovapd %ymm2, %ymm15
+; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 488(%rbp), %ymm10
-; AVX2-NEXT: vmovapd %ymm5, %ymm8
-; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10
+; AVX2-NEXT: vmovapd %ymm7, %ymm8
+; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmovapd %ymm6, %ymm7
+; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 496(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
 ; AVX2-NEXT: vmovapd 48(%rbp), %ymm4
 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 504(%rbp), %ymm10
 ; AVX2-NEXT: vmovapd 112(%rbp), %ymm2
 ; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmovapd 80(%rbp), %ymm14
+; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 512(%rbp), %ymm10
-; AVX2-NEXT: vmovapd 144(%rbp), %ymm13
-; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11
+; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmovapd 176(%rbp), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 520(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm2
-; AVX2-NEXT: vmovapd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovapd 208(%rbp), %ymm2
-; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vbroadcastsd 528(%rbp), %ymm0
-; AVX2-NEXT: vmovapd %ymm6, %ymm2
-; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm3
+; AVX2-NEXT: vmulpd %ymm0, %ymm13, %ymm1
 ; AVX2-NEXT: vbroadcastsd 536(%rbp), %ymm10
+; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11
+; AVX2-NEXT: vmovapd %ymm5, %ymm6
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm0
+; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm10
 ; AVX2-NEXT: vmovapd %ymm12, %ymm5
-; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmulpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovapd %ymm1, %ymm12
-; AVX2-NEXT: vmovapd %ymm14, %ymm6
-; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 544(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm11
+; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11
+; AVX2-NEXT: vmovapd %ymm3, %ymm12
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 552(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 560(%rbp), %ymm10
-; AVX2-NEXT: vmovapd 16(%rbp), %ymm1
-; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11
+; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
 ; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmovapd %ymm4, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 568(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmovapd 80(%rbp), %ymm4
-; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 576(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11
+; AVX2-NEXT: vmovapd 144(%rbp), %ymm4
+; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmovapd 176(%rbp), %ymm13
-; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 584(%rbp), %ymm10
-; AVX2-NEXT: vmovapd 240(%rbp), %ymm1
-; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovapd 208(%rbp), %ymm14
-; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmovapd 240(%rbp), %ymm14
+; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovapd 208(%rbp), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vbroadcastsd 592(%rbp), %ymm0
-; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vmulpd %ymm0, %ymm13, %ymm1
 ; AVX2-NEXT: vbroadcastsd 600(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmulpd %ymm0, %ymm12, %ymm0
-; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10
+; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm11
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm0
+; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 608(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm11
+; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 616(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 624(%rbp), %ymm10
 ; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmovapd 48(%rbp), %ymm1
-; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 632(%rbp), %ymm10
-; AVX2-NEXT: vmovapd 112(%rbp), %ymm1
-; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3
-; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10
+; AVX2-NEXT: vmovapd 112(%rbp), %ymm3
+; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
+; AVX2-NEXT: vmovapd 80(%rbp), %ymm3
+; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10
 ; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0
 ; AVX2-NEXT: vbroadcastsd 640(%rbp), %ymm10
-; AVX2-NEXT: vmovapd 144(%rbp), %ymm1
-; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11
+; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11
 ; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0
-; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10
-; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vmovapd 176(%rbp), %ymm3
+; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm10
+; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm1
 ; AVX2-NEXT: vbroadcastsd 648(%rbp), %ymm10
-; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11
-; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm1
+; AVX2-NEXT: vmovapd %ymm14, %ymm4
+; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11
+; AVX2-NEXT: vaddpd %ymm1, %ymm11, %ymm1
 ; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vbroadcastsd 656(%rbp), %ymm1
-; AVX2-NEXT: vmulpd %ymm1, %ymm2, %ymm3
+; AVX2-NEXT: vmovapd %ymm13, %ymm3
+; AVX2-NEXT: vmulpd %ymm1, %ymm13, %ymm2
 ; AVX2-NEXT: vbroadcastsd 664(%rbp), %ymm0
-; AVX2-NEXT: vmulpd %ymm0, %ymm5, %ymm13
-; AVX2-NEXT: vaddpd %ymm3, %ymm13, %ymm3
-; AVX2-NEXT: vmulpd %ymm1, %ymm12, %ymm1
-; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm14
+; AVX2-NEXT: vmovapd %ymm6, %ymm10
+; AVX2-NEXT: vaddpd %ymm2, %ymm14, %ymm2
+; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm1
+; AVX2-NEXT: vmulpd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vmovapd %ymm5, %ymm6
 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: vbroadcastsd 672(%rbp), %ymm1
-; AVX2-NEXT: vmulpd %ymm1, %ymm15, %ymm13
-; AVX2-NEXT: vaddpd %ymm0, %ymm13, %ymm0
-; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm1
-; AVX2-NEXT: vaddpd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vbroadcastsd 680(%rbp), %ymm3
-; AVX2-NEXT: vmulpd %ymm3, %ymm8, %ymm13
-; AVX2-NEXT: vaddpd %ymm1, %ymm13, %ymm1
-; AVX2-NEXT: vmulpd %ymm3, %ymm7, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vbroadcastsd 688(%rbp), %ymm3
-; AVX2-NEXT: vmovapd 16(%rbp), %ymm10
-; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm13
-; AVX2-NEXT: vaddpd %ymm0, %ymm13, %ymm0
-; AVX2-NEXT: vmulpd 48(%rbp), %ymm3, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vbroadcastsd 696(%rbp), %ymm3
-; AVX2-NEXT: vmulpd 112(%rbp), %ymm3, %ymm13
-; AVX2-NEXT: vaddpd %ymm1, %ymm13, %ymm1
-; AVX2-NEXT: vmulpd 80(%rbp), %ymm3, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vbroadcastsd 704(%rbp), %ymm3
-; AVX2-NEXT: vmulpd 144(%rbp), %ymm3, %ymm13
-; AVX2-NEXT: vaddpd %ymm0, %ymm13, %ymm0
-; AVX2-NEXT: vmulpd 176(%rbp), %ymm3, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vbroadcastsd 712(%rbp), %ymm13
-; AVX2-NEXT: vmovapd 240(%rbp), %ymm11
-; AVX2-NEXT: vmulpd %ymm13, %ymm11, %ymm3
-; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vmulpd %ymm13, %ymm14, %ymm1
-; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vbroadcastsd 720(%rbp), %ymm1
-; AVX2-NEXT: vmulpd %ymm1, %ymm2, %ymm4
-; AVX2-NEXT: vmulpd %ymm1, %ymm12, %ymm1
-; AVX2-NEXT: vbroadcastsd 728(%rbp), %ymm2
-; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm5
-; AVX2-NEXT: vaddpd %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vmulpd %ymm2, %ymm6, %ymm2
+; AVX2-NEXT: vmulpd %ymm1, %ymm12, %ymm14
+; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0
+; AVX2-NEXT: vmulpd %ymm1, %ymm15, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vbroadcastsd 680(%rbp), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm8, %ymm14
+; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1
+; AVX2-NEXT: vmulpd %ymm2, %ymm7, %ymm2
+; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd 688(%rbp), %ymm2
+; AVX2-NEXT: vmovapd 16(%rbp), %ymm11
+; AVX2-NEXT: vmulpd %ymm2, %ymm11, %ymm14
+; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0
+; AVX2-NEXT: vmulpd 48(%rbp), %ymm2, %ymm2
 ; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vbroadcastsd 736(%rbp), %ymm2
-; AVX2-NEXT: vmulpd %ymm2, %ymm15, %ymm5
-; AVX2-NEXT: vaddpd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vbroadcastsd 696(%rbp), %ymm2
+; AVX2-NEXT: vmovapd 112(%rbp), %ymm5
+; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm14
+; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1
+; AVX2-NEXT: vmovapd 80(%rbp), %ymm5
+; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd 704(%rbp), %ymm2
+; AVX2-NEXT: vmulpd 144(%rbp), %ymm2, %ymm14
+; AVX2-NEXT: vaddpd %ymm0, %ymm14, %ymm0
+; AVX2-NEXT: vmovapd 176(%rbp), %ymm13
+; AVX2-NEXT: vmulpd %ymm2, %ymm13, %ymm2
+; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vbroadcastsd 712(%rbp), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm4, %ymm14
+; AVX2-NEXT: vaddpd %ymm1, %ymm14, %ymm1
+; AVX2-NEXT: vmovapd 208(%rbp), %ymm14
+; AVX2-NEXT: vmulpd %ymm2, %ymm14, %ymm2
+; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd 720(%rbp), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm3
 ; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm2
-; AVX2-NEXT: vaddpd %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vbroadcastsd 728(%rbp), %ymm4
+; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm5
+; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vmulpd %ymm4, %ymm6, %ymm4
+; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vbroadcastsd 736(%rbp), %ymm4
+; AVX2-NEXT: vmulpd %ymm4, %ymm12, %ymm5
+; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vmulpd %ymm4, %ymm15, %ymm4
+; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vbroadcastsd 744(%rbp), %ymm4
 ; AVX2-NEXT: vmulpd %ymm4, %ymm8, %ymm5
-; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3
 ; AVX2-NEXT: vmulpd %ymm4, %ymm7, %ymm4
-; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
 ; AVX2-NEXT: vbroadcastsd 752(%rbp), %ymm4
-; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm5
-; AVX2-NEXT: vaddpd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vmulpd %ymm4, %ymm11, %ymm5
+; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2
 ; AVX2-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4
-; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vbroadcastsd 760(%rbp), %ymm4
 ; AVX2-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5
-; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3
 ; AVX2-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4
-; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
 ; AVX2-NEXT: vbroadcastsd 768(%rbp), %ymm4
 ; AVX2-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5
-; AVX2-NEXT: vaddpd %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vmulpd 176(%rbp), %ymm4, %ymm4
-; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vbroadcastsd 776(%rbp), %ymm4
-; AVX2-NEXT: vmulpd %ymm4, %ymm11, %ymm5
 ; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vmulpd %ymm4, %ymm13, %ymm4
+; AVX2-NEXT: vaddpd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vbroadcastsd 776(%rbp), %ymm4
+; AVX2-NEXT: vmulpd 240(%rbp), %ymm4, %ymm5
+; AVX2-NEXT: vaddpd %ymm5, %ymm3, %ymm3
 ; AVX2-NEXT: vmulpd %ymm4, %ymm14, %ymm4
-; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vmovapd %ymm2, 480(%rdi)
-; AVX2-NEXT: vmovapd %ymm1, 448(%rdi)
-; AVX2-NEXT: vmovapd %ymm3, 416(%rdi)
+; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vmovapd %ymm3, 480(%rdi)
+; AVX2-NEXT: vmovapd %ymm2, 448(%rdi)
+; AVX2-NEXT: vmovapd %ymm1, 416(%rdi)
 ; AVX2-NEXT: vmovapd %ymm0, 384(%rdi)
 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-NEXT: vmovaps %ymm0, 352(%rdi)
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index 31c090a5a2b1d..acf4d900745d3 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -2639,28 +2639,28 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
 ;
 ; SSE41-LABEL: vec128_i8_unsigned_reg_reg:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pminub %xmm1, %xmm2
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pcmpeqb %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE41-NEXT: pminub %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqb %xmm3, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm2, %xmm1
+; SSE41-NEXT: psubb %xmm3, %xmm1
 ; SSE41-NEXT: psrlw $1, %xmm1
 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm1, %xmm4
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT: pmullw %xmm1, %xmm2
 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm1, %xmm4
-; SSE41-NEXT: pmullw %xmm3, %xmm2
 ; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: packuswb %xmm4, %xmm2
-; SSE41-NEXT: paddb %xmm2, %xmm0
+; SSE41-NEXT: pmullw %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm1, %xmm3
+; SSE41-NEXT: packuswb %xmm2, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: vec128_i8_unsigned_reg_reg:
@@ -3115,26 +3115,26 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
 ;
 ; SSE41-LABEL: vec128_i8_signed_reg_mem:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: movdqa (%rdi), %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pminsb %xmm1, %xmm3
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm3, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT: pminsb %xmm2, %xmm3
+; SSE41-NEXT: pmaxsb %xmm0, %xmm2
+; SSE41-NEXT: psubb %xmm3, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
 ; SSE41-NEXT: pmullw %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm1, %xmm3
-; SSE41-NEXT: packuswb %xmm2, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: packuswb %xmm1, %xmm3
 ; SSE41-NEXT: paddb %xmm3, %xmm0
 ; SSE41-NEXT: retq
 ;
@@ -3354,26 +3354,26 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; SSE41-LABEL: vec128_i8_signed_mem_mem:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: movdqa (%rsi), %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: movdqa (%rsi), %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pcmpgtb %xmm3, %xmm2
+; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pminsb %xmm2, %xmm0
-; SSE41-NEXT: pmaxsb %xmm1, %xmm2
-; SSE41-NEXT: psubb %xmm0, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT: pminsb %xmm3, %xmm0
+; SSE41-NEXT: pmaxsb %xmm1, %xmm3
+; SSE41-NEXT: psubb %xmm0, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm2, %xmm3
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT: pmullw %xmm3, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
 ; SSE41-NEXT: pmullw %xmm4, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: packuswb %xmm3, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm0
 ; SSE41-NEXT: paddb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index a6116175226ea..27a9acf181ea2 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -144,28 +144,28 @@ define void @test1(ptr %A, ptr %B) {
 ; X32: # %bb.0: # %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: paddd %xmm0, %xmm1
-; X32-NEXT: movq %xmm1, (%eax)
 ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X32-NEXT: pmuludq %xmm0, %xmm1
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X32-NEXT: pmuludq %xmm0, %xmm2
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movq %xmm1, (%eax)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pand %xmm1, %xmm0
+; X32-NEXT: paddd %xmm1, %xmm0
 ; X32-NEXT: movq %xmm0, (%eax)
 ; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: por %xmm0, %xmm1
+; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-NEXT: pmuludq %xmm1, %xmm0
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X32-NEXT: pmuludq %xmm1, %xmm2
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movq %xmm0, (%eax)
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: pand %xmm0, %xmm1
 ; X32-NEXT: movq %xmm1, (%eax)
 ; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pxor %xmm1, %xmm0
+; X32-NEXT: por %xmm1, %xmm0
 ; X32-NEXT: movq %xmm0, (%eax)
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: pxor %xmm0, %xmm1
+; X32-NEXT: movq %xmm1, (%eax)
 ; X32-NEXT: emms
 ; X32-NEXT: retl
 ;
diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index c6801b5757ea1..6829356bf107e 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -10,397 +10,396 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: pushl %edi
 ; X32-NEXT: pushl %esi
 ; X32-NEXT: subl $400, %esp # imm = 0x190
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 60(%ecx), %esi
+; X32-NEXT: movl 60(%eax), %ebp
+; X32-NEXT: movl 56(%eax), %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl (%edx), %esi
 ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl 56(%ecx), %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl (%eax), %ebp
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 4(%eax), %ecx
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %ecx, %edi
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %esi
 ; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb %bl
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl 4(%ebx), %ebx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl 48(%edi), %esi
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 48(%ecx), %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: mull %ebp
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 52(%edi), %eax
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl 52(%ecx), %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
 ; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: adcl %ecx, %ebp
 ; X32-NEXT: setb %bl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebp, %ecx
-; X32-NEXT: movzbl %bl, %eax
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 8(%eax), %ebp
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl 8(%eax), %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %edi, %esi
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 12(%eax), %edi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: adcl %ebp, %ebx
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edi, %ebp
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 12(%eax), %ecx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebp, %eax
 ; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebx, %edi
-; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: movzbl %bl, %eax
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebp, %ecx
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %esi, %ebp
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl 40(%esi), %edi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 40(%ecx), %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 44(%esi), %ebp
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl 44(%ecx), %ebp
 ; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %esi
 ; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb %bl
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl 32(%ebp), %edi
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 32(%ecx), %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: mull %esi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 36(%ebp), %eax
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl 36(%ecx), %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %ebp
 ; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %edi, %esi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %edi
-; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ecx
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
 ; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %edi, %ecx
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl %esi, %ebx
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %edi, %esi
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: adcl %ebp, %ebx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %edi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebx, %edi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: movzbl %bl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: mull %ebp
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb %bl
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebp
-; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl %bl, %ecx
-; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl %ecx, %esi
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %edi, %ecx
-; X32-NEXT: movl (%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: movl %edi, %edx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X32-NEXT: adcl %edi, %eax
-; X32-NEXT: adcl $0, %edx
+; X32-NEXT: adcl $0, %esi
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 16(%eax), %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl 16(%eax), %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
 ; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ebp, %ebx
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %esi
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 20(%eax), %edx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: mull %edx
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl 20(%eax), %ecx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %edi
 ; X32-NEXT: addl %ebx, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ebp
+; X32-NEXT: adcl %esi, %edi
 ; X32-NEXT: setb %bl
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %esi, %edi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %edi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movzbl %bl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
 ; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
 ; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %esi, %eax
+; X32-NEXT: addl %edi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebp, %ecx
+; X32-NEXT: adcl %esi, %ebp
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ebx, %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 24(%eax), %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl 24(%eax), %edi
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: adcl $0, %ecx
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 28(%eax), %ecx
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %esi, %ebp
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ecx
+; X32-NEXT: movl 28(%eax), %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ebp, %eax
 ; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebp, %esi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -412,180 +411,180 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
 ; X32-NEXT: movl %edi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %edi
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %ebx, %edi
-; X32-NEXT: setb %bl
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movzbl %bl, %edi
-; X32-NEXT: adcl %edi, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: addl %esi, %edi
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT: addl %ebp, %edx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl %edi, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %ebp, %ecx
-; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl %edx, %eax
 ; X32-NEXT: adcl $0, %eax
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %ebx
 ; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb %bl
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %edi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %edi
-; X32-NEXT: addl %ebp, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %edi
-; X32-NEXT: setb %bl
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
 ; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %edi, %ecx
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: mull %edi
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: mull %edi
 ; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %esi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: adcl %ebx, %ebp
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %edi
-; X32-NEXT: movl %edi, %ebx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebp, %edi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: movzbl %bl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebx, %ecx
-; X32-NEXT: mull %ebx
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %edi, %ecx
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -595,388 +594,390 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: adcl %eax, %ecx
 ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 24(%esi), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 24(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl 28(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 28(%esi), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 16(%edi), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 16(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 20(%edi), %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 20(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ebp -; 
X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %esi, %ebx -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; 
X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 8(%esi), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl 12(%esi), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 8(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 12(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, 
%eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl (%ebp), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl 4(%ecx), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 4(%ebp), %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %edi, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %esi, %ebx -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edx, 
%esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: adcl %ecx, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X32-NEXT: adcl %edi, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movl 
%ebx, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %eax +; X32-NEXT: adcl $0, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %eax # 4-byte 
Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -988,179 +989,182 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %bl, %edi -; X32-NEXT: adcl %edi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: addl %ebp, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %edx, %eax ; X32-NEXT: adcl $0, %eax +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1174,11 +1178,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl $0, %eax ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1190,12 +1194,12 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1204,198 +1208,199 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl 32(%ebx), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 32(%edi), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 36(%eax), %ecx -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, %ebx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 36(%eax), %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %edi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; 
X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %cl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 40(%eax), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 40(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 44(%eax), %ebx +; X32-NEXT: movl 44(%eax), %ecx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: adcl %edi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1406,78 +1411,78 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte 
Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X32-NEXT: adcl %ebx, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movl %ebx, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: adcl %edi, %eax +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1490,58 +1495,59 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %ebp, %edi ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 52(%eax), %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 52(%eax), %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %ebx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ebp, %eax ; 
X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1549,74 +1555,74 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 56(%eax), %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 60(%eax), %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl 
(%esp), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl 60(%esi), %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X32-NEXT: adcl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1625,164 
+1631,166 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: movl %esi, %ecx ; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; 
X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %ecx, %edi -; X32-NEXT: adcl %eax, %edi +; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: adcl %eax, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 
4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl %eax, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1795,27 +1803,27 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload @@ -1824,370 +1832,374 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: adcl $0, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %cl ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edi, %esi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte 
Folded Spill -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb %bl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl 
%ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: adcl %ecx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: mull %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: 
movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %bl, %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl (%esp), %edx # 4-byte Reload +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %esi, %ecx 
-; X32-NEXT: movl %ebp, %esi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X32-NEXT: adcl %ebx, %eax -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: adcl %edi, %eax +; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -2211,170 +2223,175 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %esi
 ; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: setb %bl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: addl %ebp, %edi
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl %edi, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ecx
 ; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl %edi, %eax
 ; X32-NEXT: adcl $0, %eax
 ; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl $0, %ecx
 ; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %bl
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: mull %ebp
-; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %esi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %edx, %edi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: setb %bl
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
-; X32-NEXT: movzbl %bl, %eax
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %ebp
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %edi, %ebp
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
 ; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
 ; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: adcl %ebx, %edi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebx, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: setb %bl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movl %esi, %ebp
+; X32-NEXT: adcl %eax, %ebp
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2386,14 +2403,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %ebp
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl %eax, %ebx
 ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2420,286 +2437,289 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: adcl $0, %edi
 ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 64(%eax), %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl 64(%eax), %ecx
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 68(%eax), %ecx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %bl
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 68(%eax), %edi
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %ebp
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebp, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
 ; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %ebp
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 72(%eax), %ebx
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ebx
 ; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebp, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
 ; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 76(%eax), %edx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ebx, %esi
 ; X32-NEXT: setb %bl
-; X32-NEXT: movl %ecx, %esi
 ; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
 ; X32-NEXT: movzbl %bl, %eax
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 72(%eax), %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 76(%eax), %esi
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %ebx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %esi, %ebp
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebx, %esi
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebp, %ecx
-; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %edi, %ebp
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
 ; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %bl
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: mull %ebp
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: addl %edi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebp
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebp, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: setb %bl
+; X32-NEXT: adcl %ecx, %ebp
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, %ebp
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
-; X32-NEXT: movzbl %bl, %eax
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: mull %edi
 ; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebp, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %esi, %edi
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %ebx
+; X32-NEXT: adcl %ebx, %esi
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
 ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebx
 ; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl $0, %ecx
 ; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %bl
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ebp
-; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl %bl, %ecx
-; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl %ecx, %esi
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %esi, %ecx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: movl %edi, %edx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X32-NEXT: adcl %edi, %eax
-; X32-NEXT: adcl $0, %edx
+; X32-NEXT: adcl $0, %esi
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2707,13 +2727,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl 80(%eax), %esi
 ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
 ; X32-NEXT: mull %esi
 ; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: mull %esi
 ; X32-NEXT: movl %edx, %esi
 ; X32-NEXT: movl %eax, %ebx
@@ -2721,274 +2741,276 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: adcl $0, %esi
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl 84(%eax), %ecx
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl %ecx, %ebp
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
 ; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %edi
 ; X32-NEXT: addl %ebx, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: adcl %esi, %edi
 ; X32-NEXT: setb %bl
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebp
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %edi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movzbl %bl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %edx, %ebx
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
 ; X32-NEXT: mull %esi
 ; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
 ; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %edi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: adcl %esi, %ebp
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebp, %edi
 ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 88(%eax), %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: movl 88(%eax), %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl 92(%eax), %edi
 ; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %esi
 ; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %ebp, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 92(%eax), %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %esi
 ; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %esi
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %edi, %ecx
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %edi
 ; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl %esi, %edi
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: addl %ebp, %edi
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT: addl %ebp, %edx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl %edi, %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl %edx, %eax
 ; X32-NEXT: adcl $0, %eax
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %ebx
 ; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
 ; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: mull %ebx
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: addl %edi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebx, %ecx
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
 ; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: mull %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebp, %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %ebx
-; X32-NEXT: movl %ebx, %ebp
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
 ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: movl %ebp, %ecx
-; X32-NEXT: mull %ebp
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %esi, %ebp
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: addl %ebp, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: addl %ebx, %ecx
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -3001,152 +3023,152 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: adcl %eax, %ecx
 ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, %esi
 ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb %bl
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: mull %edi
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: addl %ebx, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %ebp
-; X32-NEXT: setb %bl
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %esi, %edi
-; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %ebp, %ecx
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: addl %esi, %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: adcl %ebp, %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, %ebp
 ; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: addl %esi, %edi
 ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT: adcl %eax, %ebp
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, %edi
 ; X32-NEXT: adcl $0, %ebp
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl %ecx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
 ; X32-NEXT: movl %eax, %ebx
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %esi, %eax
 ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %edx, %esi
 ; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %cl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl %ebp, %ebx
-; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %ebx, %esi
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %esi, %ecx
-; X32-NEXT: imull %eax, %ecx
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
+; X32-NEXT: imull %eax, %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT: imull %ebp, %esi
-; X32-NEXT: addl %edx, %esi
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: imull %ebp, %eax
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: movl %eax, %esi
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -3163,27 +3185,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT: mull %ebp
-; X32-NEXT: movl %ebp, %edi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebp
+; X32-NEXT: adcl %ecx, %ebx
 ; X32-NEXT: setb %cl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ebx, %eax
 ; X32-NEXT: movzbl %cl, %ecx
 ; X32-NEXT: adcl %ecx, %edx
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -3191,7 +3212,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT: imull %eax, %ecx
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT: mull %ebp
@@ -3215,242 +3236,244 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT: adcl %ebp, %ecx
 ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %edx, %ebp
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %edx, %ebx
 ; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
-; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: adcl $0, %ebx
 ; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl %ebp, %ebx
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: adcl %ebx, %ebp
 ; X32-NEXT: setb %cl
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebp, %eax
 ; X32-NEXT: movzbl %cl, %ecx
 ; X32-NEXT: adcl %ecx, %edx
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl 104(%esi), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 104(%ecx), %ebx
 ; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT: mull %edi
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 108(%esi), %esi
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl 108(%ecx), %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %edi
-; X32-NEXT: movl %eax, %ebp
-; X32-NEXT: addl %ecx, %ebp
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl $0, %ecx
 ; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT: mull %ebx
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT: movl %esi, %eax
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: mull %ebx
-; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: addl %esi, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl 96(%esi), %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl %edx, %ecx
-; X32-NEXT: movl 100(%esi), %eax
-; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: addl %ecx, %ebx
-; X32-NEXT: adcl $0, %ebp
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl 96(%esi), %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT: mull %ecx
-; X32-NEXT: movl %edx, %esi
-; X32-NEXT: addl %ebx, %eax
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl %ebp, %esi
-; X32-NEXT: setb %bl
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl 100(%esi), %ebp
 ; X32-NEXT: movl %ebp, %eax
 ; X32-NEXT: mull %ecx
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: addl %esi, %ecx
-; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: movzbl %cl, %eax
 ; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: movl %edi, %esi
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: mull %edi
-; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %edx, %ebp
-; X32-NEXT: addl %edi, %eax
-; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebx, %ebp
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %edi
 ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: mull %esi
-; X32-NEXT: movl %esi, %ebx
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: addl %ebp, %esi
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %edi, %ecx
 ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte
Spill +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 112(%ecx), %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 112(%esi), %edi +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: imull %edi, %ecx ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl 116(%ecx), %eax +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: 
movl 116(%esi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull %eax, %ebx ; X32-NEXT: addl %edx, %ebx -; X32-NEXT: movl 120(%ecx), %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ecx, %esi +; X32-NEXT: movl 120(%esi), %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: imull %esi, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 124(%esi), %esi -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: addl %edx, %esi +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 124(%ecx), %ecx +; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edx +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: imull %ebp, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %edx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: addl %edx, %ebp ; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -3465,41 +3488,39 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: adcl %ebp, %edi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movzbl %bl, %edi ; X32-NEXT: adcl %edi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, %ebx ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -3516,7 +3537,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload @@ -3536,66 +3557,65 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 
{{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 88(%esi), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 88(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 92(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 92(%esi), %ebx +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 80(%ecx), %ebx ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl 84(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, %ebx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 80(%esi), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 84(%esi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: addl %esi, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -3607,130 +3627,130 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, 
%esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 72(%esi), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 72(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 76(%esi), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl 76(%ecx), %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl 64(%ebx), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) 
# 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 64(%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl 68(%ebx), %eax +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl 68(%esi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %ecx, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -3738,137 +3758,139 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, 
%ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X32-NEXT: adcl %ebx, %eax -; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ecx -; X32-NEXT: mull %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb %bl -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, %esi ; X32-NEXT: addl %ecx, %esi ; X32-NEXT: movzbl %bl, %eax @@ -3878,73 +3900,72 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: 
movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %bl, %esi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X32-NEXT: adcl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %edi, %esi ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -3953,148 +3974,146 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %ebx, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: movl %esi, %ecx ; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl 
$0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; 
X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -4107,198 +4126,199 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 96(%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 96(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 100(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 100(%eax), %esi ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %esi, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 104(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl 104(%eax), %ecx +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; 
X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 108(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: movl 108(%eax), %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: mull %ecx +; 
X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %edi, %esi -; X32-NEXT: imull %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %edx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %ebx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %edi, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: imull %ebp, %esi +; X32-NEXT: imull %edi, %esi ; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; 
X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4310,7 +4330,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull (%esp), %esi # 4-byte Folded Reload +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %esi ; X32-NEXT: movl 124(%ebx), %eax ; X32-NEXT: imull %ecx, %eax @@ -4333,139 +4353,140 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi 
-; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: setb %cl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: setb %cl -; X32-NEXT: movl %edi, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, %edi ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: 
adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ecx -; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: setb (%esp) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4473,81 +4494,83 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: 
adcl %edi, %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %edi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %edi, %esi ; X32-NEXT: imull %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %ecx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %edx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %edx +; X32-NEXT: imull %ebx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: imull %esi, %edi -; X32-NEXT: addl %edx, %edi -; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: imull %edi, %esi +; X32-NEXT: addl %edx, %esi +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) 
# 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: adcl %ecx, %ebx ; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4557,7 +4580,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: addl %edx, %ebx @@ -4571,50 +4594,49 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: imull %ebp, %ecx ; X32-NEXT: addl %edx, %ecx -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, %ebx ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: setb %cl +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movl %eax, %edx +; X32-NEXT: addl %ecx, %edx 
+; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, %ebp -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, %edx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -4623,11 +4645,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebp ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4637,35 +4661,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: 
adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4677,20 +4699,19 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl (%esp), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload @@ -4698,34 +4719,35 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: 
adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, (%ecx) @@ -4771,22 +4793,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %eax, 80(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, 84(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 88(%ecx) +; X32-NEXT: movl %ebp, 88(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, 92(%ecx) -; X32-NEXT: movl %ebp, 96(%ecx) -; X32-NEXT: movl %ebx, 100(%ecx) +; X32-NEXT: movl %ebx, 96(%ecx) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %eax, 100(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, 104(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, 108(%ecx) -; X32-NEXT: movl %edi, 112(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 116(%ecx) +; X32-NEXT: movl %eax, 112(%ecx) +; X32-NEXT: movl %edi, 116(%ecx) +; X32-NEXT: movl %edx, 120(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, 120(%ecx) -; X32-NEXT: movl %edx, 124(%ecx) +; X32-NEXT: movl %eax, 124(%ecx) ; X32-NEXT: addl $400, %esp # imm = 0x190 ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi @@ -4806,14 +4828,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq 40(%rdi), %rbx -; X64-NEXT: movq 32(%rdi), %r14 +; X64-NEXT: movq 32(%rdi), %r12 ; X64-NEXT: movq 56(%rdi), %r15 ; X64-NEXT: movq 48(%rdi), %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rsi), %r11 -; X64-NEXT: movq 8(%rsi), %r8 -; X64-NEXT: movq %rsi, %r12 +; X64-NEXT: movq 8(%rsi), %r14 +; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi @@ -4825,560 +4847,553 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r9, %rcx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r9, %r8 ; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r10, %rsi 
-; X64-NEXT: adcq %r9, %r13 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: movq %r12, %rax ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %r9, %r11 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r9 ; X64-NEXT: setb %r10b +; X64-NEXT: movq %rbx, %r11 ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r9, %rbx +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r9, %r15 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: addq %rdi, %rbx -; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: adcq %r8, %rbx ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r13 +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 16(%r13), %r8 +; X64-NEXT: movq %r12, %r10 ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r12), %r8 -; X64-NEXT: movq %r14, %r10 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rdi, %r14 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rdi, %r12 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq 24(%r12), %rbp +; X64-NEXT: movq 24(%r13), %rbp ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: addq %r14, %rax -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: adcq %r9, %r12 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: addq %r12, %rax +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: adcq %r9, %r13 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r12, %r9 +; X64-NEXT: addq %r13, %r9 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %rbx, %rcx -; X64-NEXT: movq %rcx, (%rsp) # 8-byte Spill -; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: addq %r15, %r14 ; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %r12 +; X64-NEXT: movq %r12, (%rsp) # 8-byte Spill ; X64-NEXT: adcq $0, %r9 ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq %r13, %rdi +; X64-NEXT: adcq %rcx, %rdi ; X64-NEXT: setb %r10b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r11, %rax -; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %sil, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r9, %r15 -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rdi, %r11 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %r9, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r10b, %ecx -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rdi, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq 16(%r8), %rsi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: mulq %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq 16(%r14), %r11 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 24(%r8), %r14 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 24(%r14), %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rcx, %r11 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r11, %rsi -; X64-NEXT: adcq %rdi, %rbx -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %r13, %r12 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq %rsi, %r15 +; X64-NEXT: setb %sil +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rbp, %r13 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rbx, %rdi -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq %r15, %rdi +; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %rcx -; 
X64-NEXT: movq (%r8), %r13 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq (%r14), %rbp +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq 8(%r8), %rax -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r11, %r14 -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r12, %r11 -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq 8(%r14), %r14 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rsi, %r12 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: addq %r12, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %r12 +; X64-NEXT: adcq %r15, %rsi ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r8, %rbp -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r12, %rbx +; X64-NEXT: movq %r14, %r15 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %rsi, %r13 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: addq %r9, %rbx -; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: adcq %rax, %r12 +; X64-NEXT: addq %r9, %r13 +; X64-NEXT: adcq %rbx, %r12 ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r13, %r10 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rsi, %r14 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rsi, %rbx ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: addq %r14, %rax -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: adcq %r9, %r12 -; X64-NEXT: setb %r10b ; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: adcq %r9, %rbp +; X64-NEXT: setb %r9b +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r12, %rsi -; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r11, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rbp, %rsi +; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: addq %r13, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
X64-NEXT: adcq %r12, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: adcq %rcx, %r9 +; X64-NEXT: adcq %rcx, %r15 ; X64-NEXT: setb %r10b +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r8, %rdi +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %r11 +; X64-NEXT: addq %rcx, %r12 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %rbp +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r12, %r11 +; X64-NEXT: adcq %rdi, %r13 +; X64-NEXT: setb %dil ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rbx, %rcx -; X64-NEXT: adcq %r11, %r14 -; X64-NEXT: setb %r11b -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: mulq %r15 -; X64-NEXT: addq %r14, %rax -; X64-NEXT: movzbl %r11b, %edi -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: addq %rsi, %r12 -; X64-NEXT: adcq %r9, %rcx -; X64-NEXT: movzbl %r10b, %esi -; X64-NEXT: adcq %rsi, %rax +; X64-NEXT: movq %r8, %r9 +; X64-NEXT: mulq %r14 +; X64-NEXT: addq %r13, %rax +; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: adcq %r15, %r11 +; X64-NEXT: movzbl %r10b, %ecx +; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq (%rsp), %rax # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; X64-NEXT: adcq (%rsp), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 32(%rcx), %rdi -; X64-NEXT: movq %r8, %r10 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq 32(%r8), %rcx +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %rcx 
; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %rbx, %r14 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq 40(%rcx), %rsi -; X64-NEXT: movq %r10, %rax +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq 40(%r8), %rsi +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r15 -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r11, %rsi -; X64-NEXT: adcq %r9, %rbx +; X64-NEXT: adcq %rdi, %r15 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rbx, %r11 +; X64-NEXT: addq %r15, %r11 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rdi, %r10 -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: adcq %rax, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %r15, %rbx -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r15, %rbp +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: addq %r13, %rax +; X64-NEXT: addq %rbp, %rax ; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill -; X64-NEXT: adcq %r14, %r10 -; X64-NEXT: setb %r15b -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r10, %r14 -; X64-NEXT: movzbl %r15b, %eax -; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: addq %r8, %r14 -; X64-NEXT: adcq %rsi, %rbx +; X64-NEXT: adcq %r13, %r10 +; X64-NEXT: setb %bl +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r10, %rbp +; X64-NEXT: movzbl %bl, %eax +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: addq %r12, %rbp +; X64-NEXT: adcq %rsi, %r15 ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: movq 48(%rcx), %rcx -; X64-NEXT: movq %rbp, %r15 -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq 48(%r8), %rcx +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq %r14, %rax +; 
X64-NEXT: movq %r14, %r12 ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rsi, %r13 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq 56(%r8), %rsi -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r13, %r12 -; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r13, %r9 +; X64-NEXT: adcq %r10, %r14 ; X64-NEXT: setb %r8b -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r15, %r13 +; X64-NEXT: addq %r14, %r13 ; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %r14, %rbp -; X64-NEXT: movq %rbp, %r8 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: adcq %rbx, %rdi +; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: adcq %r15, %r9 ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: addq %r11, %r13 -; X64-NEXT: adcq %r9, %rsi -; X64-NEXT: setb %bpl -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: setb %r11b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r9, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rdi, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rbx, %r9 -; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r14, %rbp +; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r15, %rbx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r8, %rdi ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: addq %r13, %r12 -; X64-NEXT: adcq %rsi, %r9 -; X64-NEXT: movzbl %bpl, %eax -; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: adcq $0, %r14 +; X64-NEXT: adcq %rsi, %rbp +; X64-NEXT: movzbl %r11b, %eax +; X64-NEXT: adcq %rax, %rdi +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: adcq %rax, (%rsp) # 8-byte Folded Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: movq %rbx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r12 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: adcq $0, %r14 +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rax, %rbp ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq %rsi, %r14 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rcx, %r8 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r10, %r12 -; X64-NEXT: adcq %rsi, %r15 -; X64-NEXT: setb %r8b -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r15, %rsi -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r10, %r13 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %r11 -; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: addq %r13, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %r10 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %r8b -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: movq %rbp, %r15 -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r10, %rbp -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: addq %rdi, %rbp -; X64-NEXT: adcq %r12, %r13 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r11, %r8 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: mulq %rcx -; 
X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, %r12 +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r10, %rsi +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r8, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r11, %r13 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: addq %r15, %rax -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r8, %r12 +; X64-NEXT: movq %r12, %r11 +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r8, %r13 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %rbp, %r11 -; X64-NEXT: adcq %r13, %r15 -; X64-NEXT: movq %r15, %rbp -; X64-NEXT: adcq $0, %r12 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %rsi, %r12 -; X64-NEXT: adcq %r9, %rdi +; X64-NEXT: adcq %rax, %r12 +; X64-NEXT: addq %rbp, %r13 +; X64-NEXT: adcq %rbx, %r12 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %rbx +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r8, %r10 +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r10, %r11 +; X64-NEXT: adcq %rbp, %r8 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r9, %rbp +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %r9 +; X64-NEXT: addq %r13, %r14 +; X64-NEXT: movq %r14, %r13 +; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: adcq %rcx, %r9 ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: mulq %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rcx, %r8 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %r8, %rax ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %sil, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r12, %r10 -; X64-NEXT: adcq %rdi, %r8 -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %rbx, %r10 +; X64-NEXT: adcq %r9, %r8 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: adcq %rbx, %r11 +; X64-NEXT: adcq %rdi, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r15, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; X64-NEXT: adcq %rax, %r10 ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -5389,25 +5404,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq 64(%r9), %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq 64(%r9), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdi, %r15 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rsi, %r8 ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq 72(%r9), %rsi -; X64-NEXT: movq %r9, %rcx -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r9, %r13 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r8, %rbx @@ -5415,470 +5429,475 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: setb %r8b ; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %r10, %r9 ; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %rsi +; X64-NEXT: adcq %rax, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; X64-NEXT: movq %r12, %rax 
-; X64-NEXT: movq %r15, %rdi +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %r8, %r14 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %r12 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r13 +; X64-NEXT: movq %r12, %rcx +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, %r13 -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %r15, %r12 +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %r8, %rbp ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: addq %r11, %rbp -; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: adcq %rbx, %r15 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: movq 80(%rcx), %r15 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq 80(%r13), %r14 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %r8, %r11 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq 88(%rbx), %rbx -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq 88(%r13), %rbx +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %r8, %r13 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbp, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: addq %rbp, %rsi +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r15, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: addq %r9, %r13 -; X64-NEXT: adcq %rsi, %r12 +; X64-NEXT: adcq %rdi, %r12 ; X64-NEXT: setb %bpl ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rdi, %r10 ; X64-NEXT: adcq $0, %r8 ; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %r15 ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: addq %r10, %rax -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rax, %r10 ; X64-NEXT: adcq %r8, %rdi ; X64-NEXT: setb %r8b ; X64-NEXT: movq %rcx, %rax +; X64-NEXT: 
movq %rcx, %r9 ; X64-NEXT: mulq %rbx ; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movzbl %r8b, %ecx -; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rdx ; X64-NEXT: addq %r13, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %bpl, %ecx -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r12, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %bpl, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: imulq %rax, %rbx ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rbx, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: imulq %rcx, %r15 -; X64-NEXT: addq %rdx, %r15 +; X64-NEXT: imulq %rcx, %r14 +; X64-NEXT: addq %rdx, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r14, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi +; X64-NEXT: imulq %rsi, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %r10, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: imulq %rsi, %rbx +; X64-NEXT: imulq %r11, %rbx ; X64-NEXT: addq %rdx, %rbx ; X64-NEXT: addq %r8, %rdi -; X64-NEXT: adcq %r15, %rbx -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: adcq %r14, %rbx +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r8, %r15 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r8, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r15, %r11 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r8, %r15 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r8, %r14 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: addq %rdi, %r14 ; X64-NEXT: adcq %rbx, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq 112(%r9), %rbx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq 112(%rcx), %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: imulq %rcx, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: movq 120(%r9), %rax -; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movq 
%rdi, %rbp -; X64-NEXT: addq %rax, %rbx -; X64-NEXT: movq 96(%r9), %r10 -; X64-NEXT: movq 104(%r9), %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: imulq %rdi, %r12 ; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r12, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r10, %r14 -; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: addq %r8, %r13 -; X64-NEXT: adcq %rbx, %r14 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: imulq %r11, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: movq 120(%rcx), %rax +; X64-NEXT: imulq %rdi, %rax +; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: addq %rax, %r10 +; X64-NEXT: movq 96(%rcx), %r13 +; X64-NEXT: movq 104(%rcx), %r8 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, %rbx +; X64-NEXT: imulq %r8, %rbx +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rbx, %rdx +; X64-NEXT: imulq %r13, %r9 +; X64-NEXT: addq %rdx, %r9 +; X64-NEXT: addq %rbp, %rdi +; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: movq %r9, %r15 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r8, %r12 +; X64-NEXT: addq %r10, %r12 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r12, %rbx +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r12, %r13 ; X64-NEXT: adcq %rbp, %r10 -; X64-NEXT: setb %r8b -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: setb %bl +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: addq %r10, %rax -; X64-NEXT: movzbl %r8b, %edi -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: addq %r13, %rax -; X64-NEXT: adcq %r14, %rdx +; X64-NEXT: movzbl %bl, %r8d +; X64-NEXT: adcq %r8, %rdx +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq %r15, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; X64-NEXT: adcq %r11, %rbx -; X64-NEXT: adcq %r15, %rax +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq %r14, %rax ; X64-NEXT: adcq %rsi, %rdx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: movq 80(%r13), %r8 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 88(%r13), %r11 
-; X64-NEXT: movq %r13, %r10 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r9 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq 80(%rdi), %r10 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rdi, %r14 -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %r8, %r11 -; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: movq %r10, %rdi -; X64-NEXT: movq 64(%r10), %r10 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 88(%rdi), %r15 +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq $0, %r8 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 72(%rdi), %rax -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r9, %rdi +; X64-NEXT: adcq %r8, %rcx +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %r11, %r10 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rcx, %r12 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r11, %r9 -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r12, %rax +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: movq 64(%r14), %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r8, %r11 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq 72(%r14), %r8 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r11, %r14 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rbx, %r11 +; X64-NEXT: setb %cl +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbx, %rbp -; X64-NEXT: adcq %r14, %r12 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %r10, %rdi -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r11, %rbp +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq 
%rax, %rbx +; X64-NEXT: addq %rsi, %rbp +; X64-NEXT: adcq %rdi, %rbx +; X64-NEXT: adcq $0, %r12 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %r9, %rcx +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, %rbx -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %r10 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: adcq %r10, %rcx -; X64-NEXT: setb %dil -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %r10 +; X64-NEXT: adcq %rdi, %r13 +; X64-NEXT: setb %cl +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r13, %rdi +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %rsi ; X64-NEXT: addq %rbp, %r9 ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: adcq %rbx, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: addq %rsi, %r14 -; X64-NEXT: adcq %r13, %r10 -; X64-NEXT: setb %dil -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r9, %rax -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %sil -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %sil, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r14, %r12 -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r10, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %r12, %rdi +; X64-NEXT: adcq %r15, %rsi +; X64-NEXT: setb %cl +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq 
%rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r8, %r9 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: adcq %r11, %r13 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: addq %r13, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rsi, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: adcq %rax, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq 96(%rdi), %rsi -; X64-NEXT: imulq %rsi, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq 96(%rcx), %rsi +; X64-NEXT: imulq %rsi, %r9 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %r8, %rcx -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r15, %rdx -; X64-NEXT: movq 104(%rdi), %r9 -; X64-NEXT: imulq %r9, %rcx -; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r9, %rdx +; X64-NEXT: movq 104(%rcx), %r9 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: imulq %r9, %rax +; X64-NEXT: addq %rdx, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq 112(%rcx), %rax ; X64-NEXT: movq %rcx, %r14 -; X64-NEXT: movq 112(%rdi), %rax ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; X64-NEXT: imulq %r12, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: mulq %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: imulq %r10, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: movq 120(%rdi), %rdi -; X64-NEXT: imulq %r15, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: addq %r10, %r8 -; X64-NEXT: adcq %r14, %rdi -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq 120(%r14), %r13 +; X64-NEXT: imulq %rbx, %r13 +; X64-NEXT: addq %rdx, %r13 +; X64-NEXT: addq %rdi, %r8 +; X64-NEXT: adcq %r11, %r13 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r10, %r13 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r13, %rbp -; X64-NEXT: adcq %r14, %rcx +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rbx, %r12 +; X64-NEXT: adcq %r11, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: movzbl %sil, 
%eax -; X64-NEXT: adcq %rax, %r10 -; X64-NEXT: addq %r8, %r14 -; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq %r8, %r9 +; X64-NEXT: adcq %r13, %rbx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: imulq %r15, %rdi -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; X64-NEXT: imulq %r10, %rdi +; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; X64-NEXT: imulq %r12, %rsi -; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq %rsi, %r8 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: imulq %r9, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: imulq %r14, %rax +; X64-NEXT: addq %rdx, %rax ; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: imulq %r8, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: imulq %r11, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: adcq %r8, %rbx -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: imulq %rdi, %rbp +; X64-NEXT: addq %rdx, %rbp +; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: adcq %r13, %rbp +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r8, %rsi -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: adcq %rdi, %r8 +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: adcq %r8, %rdi ; X64-NEXT: setb %sil -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: addq %r8, %rax +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: movzbl %sil, %esi ; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq %r13, %rax +; X64-NEXT: addq %r11, %rax +; X64-NEXT: adcq %rbp, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq %r12, %rcx +; X64-NEXT: adcq %r9, %rax ; X64-NEXT: adcq %rbx, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq %rbp, %r11 -; X64-NEXT: adcq %r14, %rax -; X64-NEXT: adcq %r10, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; 
X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload @@ -5887,10 +5906,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload @@ -5898,9 +5917,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload ; X64-NEXT: movq %rdi, %r9 ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload @@ -5923,9 +5942,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq %r8, 64(%rsi) ; X64-NEXT: movq %r9, 72(%rsi) ; X64-NEXT: movq %r10, 80(%rsi) -; X64-NEXT: movq %rbx, 88(%rsi) -; X64-NEXT: movq %rcx, 96(%rsi) -; X64-NEXT: movq %r11, 104(%rsi) +; X64-NEXT: movq %r11, 88(%rsi) +; X64-NEXT: movq %r13, 96(%rsi) +; X64-NEXT: movq %rcx, 104(%rsi) ; X64-NEXT: movq %rax, 112(%rsi) ; X64-NEXT: movq %rdx, 120(%rsi) ; X64-NEXT: addq $240, %rsp diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll index 83e91063cf84f..6f6dde3aa3cf4 100644 --- a/llvm/test/CodeGen/X86/mul-i256.ll +++ b/llvm/test/CodeGen/X86/mul-i256.ll @@ -21,73 +21,73 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: .cfi_offset %edi, -16 ; X32-NEXT: .cfi_offset %ebx, -12 ; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 12(%ecx), %esi -; X32-NEXT: movl 8(%ecx), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%eax), %edi +; X32-NEXT: movl 12(%eax), %ebx +; X32-NEXT: movl 8(%eax), %ebp +; X32-NEXT: movl (%edx), %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, %ecx -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 4(%eax), %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl 4(%eax), %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl (%edi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl (%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 4(%edi), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 4(%esi), %ebp +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, %esi ; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -95,111 +95,115 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 8(%eax), %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ecx +; X32-NEXT: movl 12(%eax), %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx +; 
X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl (%esp), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %ecx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %edi -; X32-NEXT: movl %ebx, %esi -; X32-NEXT: imull %edi, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl 16(%ecx), %esi +; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: imull %esi, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: addl %esi, %edx +; X32-NEXT: addl %edi, %edx ; X32-NEXT: movl 20(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull %eax, %ebp -; X32-NEXT: addl %edx, %ebp +; X32-NEXT: movl %ebx, %edi +; X32-NEXT: imull %eax, %edi +; X32-NEXT: addl %edx, %edi ; X32-NEXT: movl 24(%ecx), %eax ; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl 28(%ecx), %ecx -; X32-NEXT: imull %esi, %ecx +; X32-NEXT: imull %ebx, %ecx ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: 
movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: addl %edi, %ebx ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload @@ -219,80 +223,80 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 { ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 24(%edi), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl 24(%edi), %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: addl %edx, %ecx +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl %edx, %esi +; X32-NEXT: movl %edi, %edx ; X32-NEXT: movl 28(%edi), %eax -; X32-NEXT: imull %esi, %eax -; X32-NEXT: addl %eax, %ecx -; X32-NEXT: movl 16(%edi), %ebp -; X32-NEXT: movl 20(%edi), %ebx +; X32-NEXT: imull %ecx, %eax +; X32-NEXT: addl %eax, %esi +; X32-NEXT: movl 16(%edi), %edi +; X32-NEXT: movl 20(%edx), %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edi -; X32-NEXT: imull %ebx, %edi -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: addl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: imull %ebp, %ebx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: imull %edi, %ecx +; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %cl, %ecx -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: mull %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, %ebx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, (%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 4(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 8(%ecx) -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 12(%ecx) -; X32-NEXT: movl %ebx, 16(%ecx) -; X32-NEXT: movl %esi, 20(%ecx) -; X32-NEXT: movl %eax, 24(%ecx) -; X32-NEXT: movl %edx, 28(%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, (%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 4(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 8(%esi) +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, 12(%esi) +; X32-NEXT: movl %ecx, 16(%esi) +; X32-NEXT: movl %edi, 20(%esi) +; X32-NEXT: movl %eax, 24(%esi) +; X32-NEXT: movl %edx, 28(%esi) ; X32-NEXT: addl $72, %esp ; X32-NEXT: .cfi_def_cfa_offset 20 ; X32-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll index c39c8337ca455..4a0f0ad94cef0 100644 --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -9,71 +9,71 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $184, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: subl $180, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl 28(%edx), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 24(%edx), %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 28(%eax), %ebx +; X32-NEXT: movl 24(%eax), %ebp +; X32-NEXT: movl (%edx), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 4(%eax), %ecx +; X32-NEXT: movl 4(%eax), %edi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, %ebp -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %ecx, %edi ; X32-NEXT: setb %cl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 20(%ecx), %ebx +; X32-NEXT: movl 16(%ecx), %ebx ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 20(%ecx), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %edi +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -81,211 +81,214 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; 
X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 8(%edi), %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 8(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl 12(%edi), %ecx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 12(%eax), %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl 
%eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 8(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 8(%ecx), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl 12(%ecx), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 12(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: mull %ebp ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: 
movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl (%ecx), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl 4(%ecx), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl (%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %esi +; X32-NEXT: movl 4(%esi), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %edi +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %ecx, %ebp ; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; 
X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %edi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X32-NEXT: adcl %edi, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload +; X32-NEXT: adcl %ebx, %eax ; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -295,302 +298,300 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 16(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 16(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 20(%eax), %ecx -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebx ; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, %ebx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 20(%eax), %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax ; 
X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 24(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 28(%eax), %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 24(%eax), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: 
movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ebx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: addl %ebp, %esi ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl $0, %eax +; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl 
%edi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %edx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %eax -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %esi ; X32-NEXT: addl %edi, %esi -; X32-NEXT: movzbl %bl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: mull %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %edi, %ecx +; X32-NEXT: addl %ebx, %ecx ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %eax -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload 
; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -606,410 +607,411 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 32(%eax), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 32(%eax), %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 36(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 36(%eax), %esi ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl %ecx, %ebp +; X32-NEXT: setb %cl ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, %esi ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, 
%ebp -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 40(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl 40(%eax), %ecx +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 44(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl 44(%edi), %edi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: setb %cl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebp -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: mull %ecx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %edi, %esi -; X32-NEXT: imull %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: 
movl %esi, %eax +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %edx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %esi -; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: imull %edi, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: imull %ebp, %esi +; X32-NEXT: imull %ecx, %esi ; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl (%esp), %eax # 4-byte Reload -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X32-NEXT: movl 56(%ebx), %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 56(%edi), %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl 60(%ebx), %eax -; X32-NEXT: imull %ecx, %eax -; X32-NEXT: addl %eax, %esi -; X32-NEXT: movl 48(%ebx), %edi -; X32-NEXT: movl 52(%ebx), %ebp +; X32-NEXT: imull 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl 60(%edi), %eax +; X32-NEXT: imull %ebp, %eax +; X32-NEXT: addl %eax, %ecx +; X32-NEXT: movl 48(%edi), %esi +; X32-NEXT: movl 52(%edi), %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: imull %ebp, %ebx -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi +; X32-NEXT: imull %edi, %ebx +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %esi ; X32-NEXT: addl %ebx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %edi, %ecx -; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %esi, %ebx +; X32-NEXT: addl %edx, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ecx, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: mull %edi +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 40(%edi), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 40(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 44(%edi), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl 44(%ecx), %ebp ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: setb %cl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 32(%ecx), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 32(%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 36(%ecx), %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl 36(%esi), %ebp ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: addl %esi, %eax ; 
X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: adcl %ecx, %ebx ; X32-NEXT: setb %cl -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl %edi, %ebp +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ecx -; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; 
X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 48(%ecx), %edi -; X32-NEXT: movl %ebx, %esi -; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 48(%esi), %edi +; X32-NEXT: imull %edi, %ecx ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: addl %esi, %edx -; X32-NEXT: movl 52(%ecx), %eax +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl 52(%esi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull %eax, %ebx ; X32-NEXT: addl %edx, %ebx -; X32-NEXT: movl 56(%ecx), %eax +; X32-NEXT: movl 56(%esi), %eax ; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: imull %ebp, %esi @@ -1025,78 +1027,77 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl 
$0, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %edi -; X32-NEXT: setb %cl +; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ecx ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: imull %esi, %ecx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: addl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %edi +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %edi, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: imull %ebp, %ecx ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb %cl +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: 
addl %ebx, %eax -; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1107,7 +1108,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -1164,7 +1165,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X32-NEXT: movl %edi, 52(%ecx) ; X32-NEXT: movl %eax, 56(%ecx) ; X32-NEXT: movl %edx, 60(%ecx) -; X32-NEXT: addl $184, %esp +; X32-NEXT: addl $180, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -1181,200 +1182,200 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: pushq %rbx ; X64-NEXT: pushq %rax ; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rdi), %rbx -; X64-NEXT: movq 8(%rdi), %r9 -; X64-NEXT: movq 24(%rdi), %r12 -; X64-NEXT: movq 16(%rdi), %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: movq 8(%rdi), %rdi +; X64-NEXT: movq 24(%rax), %r14 +; X64-NEXT: movq 16(%rax), %rax +; X64-NEXT: movq (%rsi), %r8 ; X64-NEXT: movq 8(%rsi), %r11 -; X64-NEXT: movq %rsi, %rdi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, %rax -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r15 -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r10, %rcx -; X64-NEXT: adcq %r8, %r14 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r10, %r15 +; X64-NEXT: adcq %r9, %rcx ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r14, %r10 -; X64-NEXT: adcq %rsi, %r13 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %r8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r14, %r15 -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: addq %r15, %rax +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbp, %rbx +; X64-NEXT: adcq %rbx, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rbx, %rbp +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %r14 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; X64-NEXT: adcq %rcx, %r14 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%rdi), %r8 -; X64-NEXT: movq %r12, %r11 +; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: adcq %r15, %r14 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %r12 ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r13, %rsi +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 16(%r13), %r10 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, %r12 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %r15 -; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: adcq $0, %r15 ; X64-NEXT: movq 24(%rsi), %rsi -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r15, %r11 -; X64-NEXT: adcq %rbx, %r9 -; X64-NEXT: setb %bl +; X64-NEXT: addq %rbp, %r11 +; X64-NEXT: adcq %r15, %rcx +; X64-NEXT: setb %dil ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r9, %rcx -; X64-NEXT: movzbl %bl, %eax +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %rbp, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r14, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq 
$0, %rbp ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: addq %r10, %rcx -; X64-NEXT: adcq %r13, %r15 -; X64-NEXT: setb %r12b +; X64-NEXT: addq %r9, %rbp +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: setb %dil +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbx, %rax ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %r9, %rbp -; X64-NEXT: setb %dil -; X64-NEXT: movq %r10, %rax +; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movzbl %dil, %edi -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %rbp, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r15, %rbx ; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r12b, %ecx -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 32(%rcx), %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq 32(%rdi), %r15 ; X64-NEXT: imulq %r15, %rsi ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq 40(%rcx), %rsi -; X64-NEXT: imulq %rsi, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: movq 48(%rcx), %rax -; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: movq 40(%rdi), %rsi +; X64-NEXT: imulq %rsi, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: movq 48(%rdi), %rax +; X64-NEXT: movq %rdi, %r8 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: imulq %r14, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: imulq %r9, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: movq 56(%r11), %r11 -; X64-NEXT: imulq %rbx, %r11 -; X64-NEXT: addq %rdx, %r11 -; X64-NEXT: addq %r9, %rcx -; X64-NEXT: adcq %r8, %r11 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, %r8 +; X64-NEXT: movq 56(%r8), %r8 +; X64-NEXT: imulq %r11, %r8 +; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: movq %r11, %rax ; X64-NEXT: 
mulq %r15 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rbx -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rcx, %r15 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: adcq %r9, %r15 +; X64-NEXT: addq %r15, %r13 +; X64-NEXT: adcq %rdi, %rcx ; X64-NEXT: setb %dil -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r15, %r8 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rcx, %r10 ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: adcq %r11, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: movq 48(%r9), %rsi +; X64-NEXT: addq %rbx, %r10 +; X64-NEXT: adcq %r8, %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq 48(%r8), %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rsi @@ -1382,68 +1383,69 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: imulq %r14, %rsi ; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq %r9, %rdx -; X64-NEXT: movq 56(%r9), %rax +; X64-NEXT: movq %r8, %rdx +; X64-NEXT: movq 56(%r8), %rax ; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: movq %rdi, %r8 ; X64-NEXT: addq %rax, %rsi -; X64-NEXT: movq 32(%r9), %r9 -; X64-NEXT: movq 40(%rdx), %r15 +; X64-NEXT: movq 32(%rdx), %rbp +; X64-NEXT: movq 40(%rdx), %r9 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: imulq %r15, %r11 -; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r11, %rdx -; X64-NEXT: imulq %r9, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq %rsi, %r10 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: imulq %r9, %rdi +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: adcq $0, %rbp +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: imulq %rbp, %r11 +; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rbx, %r9 -; X64-NEXT: adcq %rbp, %rsi -; X64-NEXT: setb %bl -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rdi, %r8 +; X64-NEXT: adcq %r15, %rcx +; X64-NEXT: setb %dil +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r14 -; X64-NEXT: addq 
%rsi, %rax -; X64-NEXT: movzbl %bl, %esi -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: adcq %r10, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq %r13, %r9 -; X64-NEXT: adcq %r8, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: adcq %r11, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq %r13, %r8 +; X64-NEXT: adcq %r10, %rax ; X64-NEXT: adcq %r12, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload +; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, (%rsi) +; X64-NEXT: movq %rdi, (%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 8(%rsi) +; X64-NEXT: movq %rdi, 8(%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 16(%rsi) +; X64-NEXT: movq %rdi, 16(%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, 24(%rsi) -; X64-NEXT: movq %rcx, 32(%rsi) -; X64-NEXT: movq %r9, 40(%rsi) -; X64-NEXT: movq %rax, 48(%rsi) -; X64-NEXT: movq %rdx, 56(%rsi) +; X64-NEXT: movq %rdi, 24(%rcx) +; X64-NEXT: movq %rsi, 32(%rcx) +; X64-NEXT: movq %r8, 40(%rcx) +; X64-NEXT: movq %rax, 48(%rcx) +; X64-NEXT: movq %rdx, 56(%rcx) ; X64-NEXT: addq $8, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll index 4e63580beb148..fc1cc1f65627a 100644 --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -24,71 +24,70 @@ define i128 @foo(i128 %t, i128 %u) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $12, %esp -; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 28 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl %eax, %esi ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ebp, %eax +; X86-NEXT: imull %esi, %eax ; X86-NEXT: addl %eax, %ebx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: movl %esi, %eax -; 
X86-NEXT: movl %ebp, %edi +; X86-NEXT: imull %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 4(%esi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%esi) -; X86-NEXT: movl %eax, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll index 38082c94a727f..8b75c6fb68c78 100644 --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -17,29 +17,29 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: movq %rdi, %r10 ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: movq %rcx, %r8 -; CHECK-NEXT: imulq %rdx, %r8 +; CHECK-NEXT: movq %rcx, %rdi +; CHECK-NEXT: imulq %rdx, %rdi ; CHECK-NEXT: movq %r11, %rax ; CHECK-NEXT: mulq %rdx -; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: movq %rdx, %r9 ; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: addq %rax, %rdi -; CHECK-NEXT: addq %r8, %rdi +; CHECK-NEXT: addq %rax, %r9 +; CHECK-NEXT: addq %rdi, %r9 ; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: sarq $63, %rax ; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: imulq %rsi, %r14 ; CHECK-NEXT: mulq %r10 -; CHECK-NEXT: movq %rax, %r9 -; CHECK-NEXT: movq %rdx, %r8 -; CHECK-NEXT: addq %r14, %r8 -; CHECK-NEXT: addq %rax, %r8 -; CHECK-NEXT: addq %rbx, %r9 -; CHECK-NEXT: adcq %rdi, %r8 +; CHECK-NEXT: movq %rax, %r8 +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: addq %r14, %rdi +; CHECK-NEXT: addq %rax, %rdi +; CHECK-NEXT: addq %rbx, %r8 +; CHECK-NEXT: adcq %r9, %rdi ; CHECK-NEXT: movq 
%r10, %rax ; CHECK-NEXT: mulq %r11 ; CHECK-NEXT: movq %rdx, %rbx -; CHECK-NEXT: movq %rax, %rdi +; CHECK-NEXT: movq %rax, %r9 ; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: mulq %r11 ; CHECK-NEXT: movq %rdx, %r11 @@ -58,8 +58,8 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: mulq %rcx ; CHECK-NEXT: addq %rbx, %rax ; CHECK-NEXT: adcq %r11, %rdx -; CHECK-NEXT: addq %r9, %rax -; CHECK-NEXT: adcq %r8, %rdx +; CHECK-NEXT: addq %r8, %rax +; CHECK-NEXT: adcq %rdi, %rdx ; CHECK-NEXT: movq %r10, %rcx ; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: xorq %rcx, %rdx @@ -67,7 +67,7 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou ; CHECK-NEXT: orq %rdx, %rcx ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %nooverflow -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: movq %r10, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll index 1756154272018..ce672a70b1f91 100644 --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -45,12 +45,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; LINUX-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; LINUX-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; LINUX-NEXT: movq %r9, %r14 -; LINUX-NEXT: movq %r8, %r15 -; LINUX-NEXT: movq %rcx, %r12 -; LINUX-NEXT: movq %rdx, %r13 -; LINUX-NEXT: movq %rsi, %rbp +; LINUX-NEXT: movl %eax, %ebp +; LINUX-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; LINUX-NEXT: movq %r8, %r14 +; LINUX-NEXT: movq %rcx, %r15 +; LINUX-NEXT: movq %rdx, %r12 +; LINUX-NEXT: movq %rsi, %r13 ; LINUX-NEXT: movq %rdi, %rbx ; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) @@ -78,12 +78,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-NEXT: callq get_f@PLT ; LINUX-NEXT: movq %rax, %r11 ; LINUX-NEXT: movq %rbx, %rdi -; LINUX-NEXT: movq %rbp, %rsi -; LINUX-NEXT: movq %r13, %rdx -; LINUX-NEXT: movq %r12, %rcx -; LINUX-NEXT: movq %r15, %r8 -; LINUX-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; LINUX-NEXT: movq %r14, %r9 +; LINUX-NEXT: movq %r13, %rsi +; LINUX-NEXT: movq %r12, %rdx +; LINUX-NEXT: movq %r15, %rcx +; LINUX-NEXT: movq %r14, %r8 +; LINUX-NEXT: movl %ebp, %eax +; LINUX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; LINUX-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -138,12 +138,12 @@ define void @f_thunk(ptr %this, ...) 
{ ; LINUX-X32-NEXT: movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; LINUX-X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; LINUX-X32-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; LINUX-X32-NEXT: movq %r9, %r14 -; LINUX-X32-NEXT: movq %r8, %r15 -; LINUX-X32-NEXT: movq %rcx, %r12 -; LINUX-X32-NEXT: movq %rdx, %r13 -; LINUX-X32-NEXT: movq %rsi, %rbp +; LINUX-X32-NEXT: movl %eax, %ebp +; LINUX-X32-NEXT: movq %r9, {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Spill +; LINUX-X32-NEXT: movq %r8, %r14 +; LINUX-X32-NEXT: movq %rcx, %r15 +; LINUX-X32-NEXT: movq %rdx, %r12 +; LINUX-X32-NEXT: movq %rsi, %r13 ; LINUX-X32-NEXT: movq %rdi, %rbx ; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) @@ -171,12 +171,12 @@ define void @f_thunk(ptr %this, ...) { ; LINUX-X32-NEXT: callq get_f@PLT ; LINUX-X32-NEXT: movl %eax, %r11d ; LINUX-X32-NEXT: movq %rbx, %rdi -; LINUX-X32-NEXT: movq %rbp, %rsi -; LINUX-X32-NEXT: movq %r13, %rdx -; LINUX-X32-NEXT: movq %r12, %rcx -; LINUX-X32-NEXT: movq %r15, %r8 -; LINUX-X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; LINUX-X32-NEXT: movq %r14, %r9 +; LINUX-X32-NEXT: movq %r13, %rsi +; LINUX-X32-NEXT: movq %r12, %rdx +; LINUX-X32-NEXT: movq %r15, %rcx +; LINUX-X32-NEXT: movq %r14, %r8 +; LINUX-X32-NEXT: movl %ebp, %eax +; LINUX-X32-NEXT: movq {{[-0-9]+}}(%e{{[sb]}}p), %r9 # 8-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload ; LINUX-X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 529e0ad24936a..f0fb89496aa1e 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1396,72 +1396,72 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-LABEL: interleave_24i32_out: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu 64(%rdi), %xmm1 +; SSE2-NEXT: movdqu 64(%rdi), %xmm2 ; SSE2-NEXT: movups 80(%rdi), %xmm4 ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm2 -; SSE2-NEXT: movups 32(%rdi), %xmm5 -; SSE2-NEXT: movdqu 48(%rdi), %xmm3 -; SSE2-NEXT: movaps %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[2,0] +; SSE2-NEXT: movdqu 16(%rdi), %xmm3 +; SSE2-NEXT: movups 32(%rdi), %xmm6 +; SSE2-NEXT: movdqu 48(%rdi), %xmm1 +; SSE2-NEXT: movaps %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[2,0] ; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm5[2,0] -; SSE2-NEXT: movaps %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm6[2,0] +; SSE2-NEXT: movaps %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; 
SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[2,0] -; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[2,0] +; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm1[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm2[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm3[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0] ; SSE2-NEXT: movups %xmm10, 16(%rsi) ; SSE2-NEXT: movups %xmm8, (%rsi) -; SSE2-NEXT: movups %xmm3, 16(%rdx) +; SSE2-NEXT: movups %xmm1, 16(%rdx) ; SSE2-NEXT: movups %xmm0, (%rdx) ; SSE2-NEXT: movups %xmm9, 16(%rcx) -; SSE2-NEXT: movups %xmm7, (%rcx) +; SSE2-NEXT: movups %xmm5, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_out: ; SSE42: # %bb.0: ; SSE42-NEXT: movups 80(%rdi), %xmm0 ; SSE42-NEXT: movdqu 64(%rdi), %xmm1 -; SSE42-NEXT: movdqu (%rdi), %xmm3 +; SSE42-NEXT: movdqu (%rdi), %xmm4 ; SSE42-NEXT: movdqu 16(%rdi), %xmm2 -; SSE42-NEXT: movups 32(%rdi), %xmm4 +; SSE42-NEXT: movups 32(%rdi), %xmm3 ; SSE42-NEXT: movdqu 48(%rdi), %xmm5 ; SSE42-NEXT: movdqa %xmm2, %xmm6 -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] -; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[2,3] -; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[1] +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1] ; SSE42-NEXT: movdqa %xmm1, %xmm8 ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3] ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1] -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3] ; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,3] +; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7] ; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3] ; SSE42-NEXT: movups %xmm5, 16(%rsi) -; SSE42-NEXT: movups %xmm3, (%rsi) +; SSE42-NEXT: movups %xmm4, (%rsi) ; SSE42-NEXT: movdqu %xmm10, 16(%rdx) ; SSE42-NEXT: movdqu %xmm6, (%rdx) ; SSE42-NEXT: movups %xmm9, 16(%rcx) @@ -1635,38 +1635,38 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2: # %bb.0: ; 
SSE2-NEXT: movups (%rsi), %xmm1 ; SSE2-NEXT: movups 16(%rsi), %xmm0 -; SSE2-NEXT: movups (%rdx), %xmm2 +; SSE2-NEXT: movups (%rdx), %xmm3 ; SSE2-NEXT: movups 16(%rdx), %xmm5 ; SSE2-NEXT: movups (%rcx), %xmm4 -; SSE2-NEXT: movups 16(%rcx), %xmm6 -; SSE2-NEXT: movaps %xmm4, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[1,3] -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2] -; SSE2-NEXT: movaps %xmm0, %xmm7 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE2-NEXT: movaps %xmm6, %xmm8 -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] -; SSE2-NEXT: movaps %xmm0, %xmm9 +; SSE2-NEXT: movups 16(%rcx), %xmm7 +; SSE2-NEXT: movaps %xmm4, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[1,3] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0,2] +; SSE2-NEXT: movaps %xmm0, %xmm8 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] +; SSE2-NEXT: movaps %xmm7, %xmm9 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[1,3] +; SSE2-NEXT: movaps %xmm0, %xmm6 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm5[1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2] -; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm5[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[0,2] +; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[0,2] ; SSE2-NEXT: movaps %xmm1, %xmm5 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE2-NEXT: movups %xmm4, 16(%rdi) -; SSE2-NEXT: movups %xmm9, 48(%rdi) -; SSE2-NEXT: movups %xmm6, 64(%rdi) -; SSE2-NEXT: movups %xmm3, (%rdi) +; SSE2-NEXT: movups %xmm6, 48(%rdi) +; SSE2-NEXT: movups %xmm7, 64(%rdi) +; SSE2-NEXT: movups %xmm2, (%rdi) ; SSE2-NEXT: movups %xmm1, 32(%rdi) ; SSE2-NEXT: movups %xmm0, 80(%rdi) ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index f57defc368d5e..2f557679a1558 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -157,35 +157,35 @@ define void @PR42833() { ; SSE2: # %bb.0: ; SSE2-NEXT: movl b(%rip), %eax ; SSE2-NEXT: movdqa c+128(%rip), %xmm0 -; SSE2-NEXT: movdqa c+144(%rip), %xmm1 +; SSE2-NEXT: movdqa c+144(%rip), %xmm2 ; SSE2-NEXT: addl c+128(%rip), %eax -; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: movdqa d+144(%rip), %xmm4 -; SSE2-NEXT: psubd %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm2, %xmm4 +; SSE2-NEXT: paddd 
%xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] -; SSE2-NEXT: movdqa %xmm1, c+144(%rip) +; SSE2-NEXT: movdqa %xmm2, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) -; SSE2-NEXT: movdqa c+160(%rip), %xmm1 +; SSE2-NEXT: movdqa c+160(%rip), %xmm2 ; SSE2-NEXT: movdqa c+176(%rip), %xmm3 ; SSE2-NEXT: movdqa d+160(%rip), %xmm5 ; SSE2-NEXT: movdqa d+176(%rip), %xmm6 ; SSE2-NEXT: movdqa d+128(%rip), %xmm7 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: psubd %xmm0, %xmm7 ; SSE2-NEXT: psubd %xmm3, %xmm6 -; SSE2-NEXT: psubd %xmm1, %xmm5 +; SSE2-NEXT: psubd %xmm2, %xmm5 ; SSE2-NEXT: movdqa %xmm5, d+160(%rip) ; SSE2-NEXT: movdqa %xmm6, d+176(%rip) ; SSE2-NEXT: movdqa %xmm4, d+144(%rip) ; SSE2-NEXT: movdqa %xmm7, d+128(%rip) ; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, c+160(%rip) +; SSE2-NEXT: paddd %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, c+160(%rip) ; SSE2-NEXT: movdqa %xmm3, c+176(%rip) ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index b0739b6f47458..4b398095b549d 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: imull %esi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: imull %ebp, %ecx ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill +; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.1: ## %bb10.preheader -; CHECK-NEXT: movl %eax, %ebp -; CHECK-NEXT: sarl $31, %ebp -; CHECK-NEXT: shrl $30, %ebp -; CHECK-NEXT: addl %eax, %ebp -; CHECK-NEXT: sarl $2, %ebp -; CHECK-NEXT: testl %edx, %edx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: sarl $31, %eax +; CHECK-NEXT: shrl $30, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: sarl $2, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.2: ## %bb.nph9 -; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: testl %ebp, %ebp ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_4: ## %bb6 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx -; CHECK-NEXT: movb %bl, (%edx,%edi) -; CHECK-NEXT: incl %edi -; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx +; CHECK-NEXT: movb %bl, (%edx,%esi) +; CHECK-NEXT: incl %esi +; CHECK-NEXT: cmpl %ebp, %esi ; CHECK-NEXT: jl LBB0_4 ; CHECK-NEXT: ## %bb.5: ## %bb9 ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %esi, %edx -; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: addl 
%ebp, %edx +; CHECK-NEXT: cmpl %edi, %ecx ; CHECK-NEXT: je LBB0_12 ; CHECK-NEXT: ## %bb.6: ## %bb7.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: jmp LBB0_4 ; CHECK-NEXT: LBB0_12: ## %bb18.loopexit -; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload -; CHECK-NEXT: addl %ebp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) +; CHECK-NEXT: cmpl $1, %edi ; CHECK-NEXT: jle LBB0_13 ; CHECK-NEXT: ## %bb.7: ## %bb.nph5 -; CHECK-NEXT: cmpl $2, %esi +; CHECK-NEXT: cmpl $2, %ebp ; CHECK-NEXT: jl LBB0_13 ; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split -; CHECK-NEXT: movl %esi, %ebp -; CHECK-NEXT: shrl $31, %ebp -; CHECK-NEXT: addl %esi, %ebp -; CHECK-NEXT: sarl %ebp +; CHECK-NEXT: movl %ebp, %edx +; CHECK-NEXT: shrl $31, %edx +; CHECK-NEXT: addl %ebp, %edx +; CHECK-NEXT: sarl %edx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $31, %ecx @@ -84,12 +84,12 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: addl $2, %edx -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: addl $2, %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload +; CHECK-NEXT: addl %esi, %ecx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_9: ## %bb13 @@ -97,90 +97,89 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: ## Child Loop BB0_10 Depth 2 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: addl %edx, %edi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %esi, %edi ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx -; CHECK-NEXT: movb %dl, (%ecx,%ebx) -; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx -; CHECK-NEXT: movb %dl, (%eax,%ebx) -; CHECK-NEXT: incl %ebx -; CHECK-NEXT: cmpl %ebp, %ebx +; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%ecx,%esi) +; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%eax,%esi) +; CHECK-NEXT: incl %esi +; CHECK-NEXT: cmpl %edx, %esi ; CHECK-NEXT: jl LBB0_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; CHECK-NEXT: incl %edi -; CHECK-NEXT: addl %ebp, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; CHECK-NEXT: addl $2, %edx -; CHECK-NEXT: 
addl %ebp, %ecx +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl $2, %esi +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; CHECK-NEXT: jl LBB0_9 ; CHECK-NEXT: LBB0_13: ## %bb20 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: cmpl $1, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: cmpl $1, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.14: ## %bb20 -; CHECK-NEXT: cmpl $3, %ecx +; CHECK-NEXT: cmpl $3, %eax ; CHECK-NEXT: jne LBB0_24 ; CHECK-NEXT: ## %bb.15: ## %bb22 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: testl %edx, %edx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jle LBB0_18 ; CHECK-NEXT: ## %bb.16: ## %bb.nph -; CHECK-NEXT: leal 15(%edx), %eax +; CHECK-NEXT: leal 15(%edi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebp, %ebp -; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: addl %edi, %ecx -; CHECK-NEXT: addl %ecx, %ebp -; CHECK-NEXT: addl %eax, %ebx -; CHECK-NEXT: leal 15(%esi), %eax +; CHECK-NEXT: addl %ebx, %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload +; CHECK-NEXT: addl %esi, %ecx +; CHECK-NEXT: addl %ecx, %ebx +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: leal 15(%ebp), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_17: ## %bb23 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: movl %ebp, %edi -; CHECK-NEXT: movl %ebx, %ebp +; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: movl %ebx, %esi ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %ebp, %ebx -; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %esi, %ebp -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edx +; CHECK-NEXT: addl %ebp, %ebx +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edi ; CHECK-NEXT: jne LBB0_17 ; CHECK-NEXT: LBB0_18: ## %bb26 -; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; CHECK-NEXT: addl %ecx, %esi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: jmp LBB0_23 ; CHECK-NEXT: LBB0_19: ## %bb29 -; CHECK-NEXT: testl %edx, %edx +; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: jle LBB0_22 ; CHECK-NEXT: ## %bb.20: ## %bb.nph11 -; CHECK-NEXT: leal 15(%esi), %eax +; CHECK-NEXT: movl %edi, %esi +; CHECK-NEXT: leal 15(%ebp), %eax ; CHECK-NEXT: andl 
$-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -188,32 +187,30 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: LBB0_21: ## %bb30 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: pushl %edx ; CHECK-NEXT: pushl %edi -; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %ebp, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %esi, %edi -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edx +; CHECK-NEXT: addl %ebp, %edi +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; CHECK-NEXT: decl %esi ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: LBB0_22: ## %bb33 -; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: LBB0_23: ## %bb33 -; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: sarl %eax ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl $128 -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %ecx ; CHECK-NEXT: calll _memset ; CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: LBB0_25: ## %return diff --git a/llvm/test/CodeGen/X86/overflow.ll b/llvm/test/CodeGen/X86/overflow.ll index b98ebdeb6b890..5900e7674cd0e 100644 --- a/llvm/test/CodeGen/X86/overflow.ll +++ b/llvm/test/CodeGen/X86/overflow.ll @@ -10,23 +10,23 @@ define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind { ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: mull %ecx diff --git a/llvm/test/CodeGen/X86/pr43820.ll b/llvm/test/CodeGen/X86/pr43820.ll index 7214ec75b0cf7..2fb4410567be6 100644 --- a/llvm/test/CodeGen/X86/pr43820.ll +++ b/llvm/test/CodeGen/X86/pr43820.ll @@ -20,9 +20,9 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %r12 ; CHECK-NEXT: movq %r12, %r10 ; CHECK-NEXT: shrq $4, %r10 -; CHECK-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F -; CHECK-NEXT: andq %rax, %r10 -; CHECK-NEXT: andq %rax, %r12 +; CHECK-NEXT: movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: andq %rsi, %r12 ; CHECK-NEXT: shlq $4, %r12 ; CHECK-NEXT: orq %r10, 
%r12 ; CHECK-NEXT: movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333 @@ -36,14 +36,13 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r13, %rbp ; CHECK-NEXT: shrq %r12 ; CHECK-NEXT: andq %r13, %r12 -; CHECK-NEXT: leaq (%r12,%rbp,2), %rsi -; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: leaq (%r12,%rbp,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %r14 ; CHECK-NEXT: movq %r14, %r12 ; CHECK-NEXT: shrq $4, %r12 -; CHECK-NEXT: movq %rax, %rbp -; CHECK-NEXT: andq %rax, %r12 -; CHECK-NEXT: andq %rax, %r14 +; CHECK-NEXT: andq %rsi, %r12 +; CHECK-NEXT: andq %rsi, %r14 ; CHECK-NEXT: shlq $4, %r14 ; CHECK-NEXT: orq %r12, %r14 ; CHECK-NEXT: movq %r14, %r12 @@ -61,8 +60,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %r15 ; CHECK-NEXT: movq %r15, %r12 ; CHECK-NEXT: shrq $4, %r12 -; CHECK-NEXT: andq %rbp, %r12 -; CHECK-NEXT: andq %rbp, %r15 +; CHECK-NEXT: andq %rsi, %r12 +; CHECK-NEXT: andq %rsi, %r15 ; CHECK-NEXT: shlq $4, %r15 ; CHECK-NEXT: orq %r12, %r15 ; CHECK-NEXT: movq %r15, %r12 @@ -79,8 +78,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rbx ; CHECK-NEXT: movq %rbx, %r15 ; CHECK-NEXT: shrq $4, %r15 -; CHECK-NEXT: andq %rbp, %r15 -; CHECK-NEXT: andq %rbp, %rbx +; CHECK-NEXT: andq %rsi, %r15 +; CHECK-NEXT: andq %rsi, %rbx ; CHECK-NEXT: shlq $4, %rbx ; CHECK-NEXT: orq %r15, %rbx ; CHECK-NEXT: movq %rbx, %r15 @@ -97,8 +96,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -116,8 +115,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -135,8 +134,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -154,8 +153,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -173,8 +172,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -192,8 +191,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: shrq $4, %rbx -; CHECK-NEXT: andq %rbp, %rbx -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rbx +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, 
%rdi ; CHECK-NEXT: orq %rbx, %rdi ; CHECK-NEXT: movq %rdi, %rbx @@ -211,8 +210,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %rdi ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %rdi +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rdi ; CHECK-NEXT: shlq $4, %rdi ; CHECK-NEXT: orq %rax, %rdi ; CHECK-NEXT: movq %rdi, %rax @@ -224,13 +223,12 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %rdi ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rdi,2), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: leaq (%rax,%rdi,2), %rdi ; CHECK-NEXT: bswapq %r9 ; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %r9 +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %r9 ; CHECK-NEXT: shlq $4, %r9 ; CHECK-NEXT: orq %rax, %r9 ; CHECK-NEXT: movq %r9, %rax @@ -247,8 +245,8 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: bswapq %r8 ; CHECK-NEXT: movq %r8, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %r8 +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %r8 ; CHECK-NEXT: shlq $4, %r8 ; CHECK-NEXT: orq %rax, %r8 ; CHECK-NEXT: movq %r8, %rax @@ -260,12 +258,13 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %r8 ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%r8,2), %r8 +; CHECK-NEXT: leaq (%rax,%r8,2), %rax +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: bswapq %rcx ; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %rcx +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rcx ; CHECK-NEXT: shlq $4, %rcx ; CHECK-NEXT: orq %rax, %rcx ; CHECK-NEXT: movq %rcx, %rax @@ -277,12 +276,12 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %rcx ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rcx,2), %r12 +; CHECK-NEXT: leaq (%rax,%rcx,2), %rbx ; CHECK-NEXT: bswapq %rdx ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: andq %rbp, %rdx +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rdx ; CHECK-NEXT: shlq $4, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %rax @@ -294,28 +293,28 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: andq %r14, %rdx ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%rdx,2), %rdi -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: bswapq %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq $4, %rcx -; CHECK-NEXT: andq %rbp, %rcx -; CHECK-NEXT: andq %rbp, %rax -; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: orq %rcx, %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: andq %r10, %rcx -; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: leaq (%rax,%rdx,2), %rdx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-NEXT: bswapq %rcx +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: shrq $4, %rax +; CHECK-NEXT: andq %rsi, %rax +; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: shlq $4, %rcx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: andq %r10, %rax -; CHECK-NEXT: leaq (%rax,%rcx,4), %rax -; CHECK-NEXT: movq %rax, %r10 -; CHECK-NEXT: andq %r14, %r10 +; CHECK-NEXT: shrq $2, %rcx +; CHECK-NEXT: andq %r10, %rcx 
+; CHECK-NEXT: leaq (%rcx,%rax,4), %rax +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: andq %r14, %rsi ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andq %r14, %rax -; CHECK-NEXT: leaq (%rax,%r10,2), %rdx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-NEXT: leaq (%rax,%rsi,2), %rsi +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rax, %rsi +; CHECK-NEXT: shrdq $24, %rax, %r10 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: shrdq $24, %rcx, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -324,48 +323,46 @@ define i1000 @square(i1000 %A) nounwind { ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r13, %rbp +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; CHECK-NEXT: shrdq $24, %r12, %r13 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r15, %r13 +; CHECK-NEXT: shrdq $24, %r15, %r12 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; CHECK-NEXT: shrdq $24, %r14, %r15 -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rbx, %r14 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r11, %rbx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r10, %r11 +; CHECK-NEXT: shrdq $24, %r11, %r14 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; CHECK-NEXT: shrdq $24, %r9, %r10 +; CHECK-NEXT: shrdq $24, %r9, %r11 +; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: shrdq $24, %rdi, %r9 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-NEXT: shrdq $24, %rdi, %r8 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-NEXT: shrdq $24, %rcx, %r9 -; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: shrdq $24, %r8, %rcx -; CHECK-NEXT: movq %rcx, %r8 -; CHECK-NEXT: shrdq $24, %r12, %rax -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrdq $24, %rdi, %r12 -; CHECK-NEXT: shrdq $24, %rdx, %rdi +; CHECK-NEXT: shrdq $24, %rcx, %rdi +; CHECK-NEXT: shrdq $24, %rbx, %rcx +; CHECK-NEXT: shrdq $24, %rdx, %rbx +; CHECK-NEXT: shrdq $24, %rsi, %rdx ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; CHECK-NEXT: movq %rdi, 112(%rax) -; CHECK-NEXT: movq %r12, 104(%rax) +; CHECK-NEXT: movq %rdx, 112(%rax) +; CHECK-NEXT: movq %rbx, 104(%rax) ; CHECK-NEXT: movq %rcx, 96(%rax) -; CHECK-NEXT: movq %r8, 88(%rax) -; CHECK-NEXT: movq %r9, 80(%rax) -; CHECK-NEXT: movq %r10, 72(%rax) +; CHECK-NEXT: movq %rdi, 88(%rax) +; CHECK-NEXT: movq %r8, 80(%rax) +; CHECK-NEXT: movq %r9, 72(%rax) ; CHECK-NEXT: movq %r11, 64(%rax) -; CHECK-NEXT: movq %rbx, 56(%rax) -; CHECK-NEXT: movq %r14, 48(%rax) -; CHECK-NEXT: movq %r15, 40(%rax) +; CHECK-NEXT: movq %r14, 56(%rax) +; CHECK-NEXT: movq %r15, 48(%rax) +; CHECK-NEXT: movq %r12, 40(%rax) ; CHECK-NEXT: movq %r13, 32(%rax) ; CHECK-NEXT: movq %rbp, 24(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 16(%rax) ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-NEXT: movq %rcx, 8(%rax) -; CHECK-NEXT: movq %rsi, (%rax) -; CHECK-NEXT: movq %rdx, %rcx -; CHECK-NEXT: shrq $56, %rdx -; CHECK-NEXT: movb %dl, 124(%rax) +; CHECK-NEXT: movq %r10, (%rax) +; CHECK-NEXT: movq %rsi, %rcx +; 
CHECK-NEXT: shrq $56, %rsi +; CHECK-NEXT: movb %sil, 124(%rax) ; CHECK-NEXT: shrq $24, %rcx ; CHECK-NEXT: movl %ecx, 120(%rax) ; CHECK-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/pr46527.ll b/llvm/test/CodeGen/X86/pr46527.ll index 5ae953ab82ab4..ab454cfe470f3 100644 --- a/llvm/test/CodeGen/X86/pr46527.ll +++ b/llvm/test/CodeGen/X86/pr46527.ll @@ -7,11 +7,11 @@ define void @f(ptr %out, <16 x i8> %in, i1 %flag) { ; CHECK-NEXT: calll .L0$pb ; CHECK-NEXT: .cfi_adjust_cfa_offset 4 ; CHECK-NEXT: .L0$pb: -; CHECK-NEXT: popl %ecx +; CHECK-NEXT: popl %eax ; CHECK-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK-NEXT: .Ltmp0: -; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: notb %dl ; CHECK-NEXT: andb $1, %dl @@ -22,8 +22,8 @@ define void @f(ptr %out, <16 x i8> %in, i1 %flag) { ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; CHECK-NEXT: paddb %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%ecx), %xmm1 -; CHECK-NEXT: movdqa %xmm1, (%eax) +; CHECK-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%eax), %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%ecx) ; CHECK-NEXT: retl entry: %0 = select i1 %flag, i8 0, i8 2 diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll index 798663c1d4dca..56618205ec7c1 100644 --- a/llvm/test/CodeGen/X86/pr46877.ll +++ b/llvm/test/CodeGen/X86/pr46877.ll @@ -5,200 +5,201 @@ define void @tester(float %0, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15, float %16, float %17, float %18, float %19, float %20, float %21, float %22, float %23, float %24, float %25, float %26, float %27, float %28, float %29, float %30, float %31, float %32, float %33, float %34, float %35, float %36, float %37, float %38, float %39, float %40, float %41, float %42, float %43, float %44, float %45, float %46, float %47, float %48, float %49, float %50, float %51, float %52, float %53, float %54, float %55, float %56, float %57, float %58, float %59, float %60, float %61, float %62, float %63, float %64, float %65, float %66, float %67, float %68, float %69, float %70, float %71, float %72, float %73, float %74, float %75, float %76, float %77, float %78, float %79, ptr %80) { ; CHECK-LABEL: tester: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps %xmm3, %xmm15 -; CHECK-NEXT: vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovaps %xmm3, %xmm13 +; CHECK-NEXT: vmovaps %xmm1, %xmm14 +; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero ; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm12 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm10 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm10 = (xmm3 * xmm10) - xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm2 -; CHECK-NEXT: vmulss %xmm2, %xmm10, %xmm4 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss %xmm6, %xmm12, %xmm2 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm4, 
%xmm2, %xmm5 -; CHECK-NEXT: vmulss %xmm0, %xmm13, %xmm2 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm2 ; CHECK-NEXT: vmovss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm8, %xmm2 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm3 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm14 * xmm4) + xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm4 -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm5 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm9 * xmm5) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm11 * xmm4) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm4, %xmm4 +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm5 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm8 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm8 -; CHECK-NEXT: vmovss %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmovaps %xmm5, %xmm10 -; CHECK-NEXT: vmulss %xmm14, %xmm8, %xmm5 +; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: vmovaps %xmm5, %xmm15 +; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm5 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm13 * xmm5) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm9 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm11 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm9 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm11 * xmm3) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm9 * xmm3) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm15 * xmm3) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm13 * xmm3) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm4 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm6 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm6, %xmm4 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vmulss 
{{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm1 * xmm4) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm6, %xmm4 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm1 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm14, %xmm5 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm10 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm5, %xmm10, %xmm5 +; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm5 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm14 * xmm5) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm5, %xmm10, %xmm5 +; CHECK-NEXT: vmulss %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm1 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm1 * xmm4) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm4 -; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm5 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm10 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm1 * xmm10) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm10, %xmm3 +; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm15 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm10 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm13, %xmm12 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm12 = -(xmm7 * xmm12) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm12 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm12 = -(xmm6 * xmm12) + xmm0 ; CHECK-NEXT: vmulss %xmm10, %xmm12, %xmm10 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vmulss %xmm4, %xmm10, %xmm12 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmulss %xmm3, %xmm10, %xmm12 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm6 * xmm2) + 
xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm5, %xmm3, %xmm6 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm9 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm1 +; CHECK-NEXT: vmulss %xmm3, %xmm15, %xmm1 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm6, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm6 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm8 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm5 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm1, %xmm6, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm5, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm4 * xmm2) + xmm0 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm4 ; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm3 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm12 * xmm3) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 -; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload -; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm1 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 +; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm3 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Reload ; CHECK-NEXT: # xmm10 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm7, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm6, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm8 = -(xmm8 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm8, %xmm2 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 +; CHECK-NEXT: vmulss 
%xmm2, %xmm7, %xmm2 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss %xmm0, %xmm6, %xmm2 -; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm6 * xmm3) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm3 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm5 * xmm3) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm1 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm4 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm6 * xmm11) + xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm9 = -(xmm5 * xmm9) + xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm3 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm13 * xmm3) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm11, %xmm2 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 4-byte Folded Reload -; CHECK-NEXT: # xmm14 = -(xmm14 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm14, %xmm4 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm2 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; CHECK-NEXT: vmulss %xmm6, %xmm13, %xmm7 -; CHECK-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 4-byte Folded Reload -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm8 -; CHECK-NEXT: vmulss %xmm5, %xmm15, %xmm5 +; CHECK-NEXT: vmulss %xmm3, %xmm9, %xmm3 +; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 4-byte Folded Reload +; CHECK-NEXT: # xmm11 = -(xmm11 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm11, %xmm3 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm4 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm6 +; CHECK-NEXT: vmulss %xmm5, %xmm14, %xmm5 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm15, %xmm7 +; CHECK-NEXT: vmulss %xmm15, %xmm13, %xmm8 ; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero -; CHECK-NEXT: 
vfnmadd213ss {{.*#+}} xmm5 = -(xmm11 * xmm5) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm8 = -(xmm11 * xmm8) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm9 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm9 = -(xmm11 * xmm9) + xmm0 -; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm11 * xmm7) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm6 = -(xmm11 * xmm6) + xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm11 * xmm5) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm10 ; CHECK-NEXT: vmulss %xmm0, %xmm12, %xmm11 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm11, %xmm11 ; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm12 * xmm11) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm12 * xmm10) + xmm0 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm8 = (xmm15 * xmm8) - xmm0 -; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm7 = (xmm13 * xmm7) - xmm0 +; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 +; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm0 +; CHECK-NEXT: vmulss %xmm0, %xmm8, %xmm0 ; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm0 -; CHECK-NEXT: vmulss %xmm0, %xmm7, %xmm0 -; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm0, %xmm6, %xmm0 +; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm6, %xmm1 +; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm10, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm8, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm7, %xmm1 ; CHECK-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovss %xmm0, (%rdi) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll index 6ae04d5ca2fdb..a6ae7ce5ccd15 100644 --- a/llvm/test/CodeGen/X86/pr57340.ll +++ b/llvm/test/CodeGen/X86/pr57340.ll @@ -13,8 +13,8 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-NEXT: vmovdqu (%rax), %xmm6 -; CHECK-NEXT: vpextrw $0, %xmm6, %eax +; CHECK-NEXT: vmovdqu (%rax), %xmm5 +; CHECK-NEXT: vpextrw $0, %xmm5, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm2 ; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 @@ -26,16 +26,16 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: vpextrw $0, %xmm3, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm3 -; CHECK-NEXT: vpsrld $16, %xmm6, %xmm4 +; CHECK-NEXT: vpsrld $16, %xmm5, %xmm4 ; CHECK-NEXT: vpextrw $0, %xmm4, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm4 ; CHECK-NEXT: setne %al ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: vcvtph2ps %xmm3, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm3, %xmm6 ; CHECK-NEXT: vcvtph2ps %xmm4, %xmm3 ; CHECK-NEXT: kmovw %eax, %k0 -; CHECK-NEXT: vucomiss %xmm5, %xmm3 +; CHECK-NEXT: vucomiss %xmm6, %xmm3 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -68,13 +68,13 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm4 -; CHECK-NEXT: vcvtph2ps %xmm4, %xmm5 -; CHECK-NEXT: vpsrlq $48, %xmm6, %xmm4 +; CHECK-NEXT: vcvtph2ps %xmm4, %xmm6 +; CHECK-NEXT: 
vpsrlq $48, %xmm5, %xmm4 ; CHECK-NEXT: vpextrw $0, %xmm4, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm4 ; CHECK-NEXT: vcvtph2ps %xmm4, %xmm4 -; CHECK-NEXT: vucomiss %xmm5, %xmm4 +; CHECK-NEXT: vucomiss %xmm6, %xmm4 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -85,13 +85,13 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-17, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] -; CHECK-NEXT: vpextrw $0, %xmm5, %eax +; CHECK-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,0,1] +; CHECK-NEXT: vpextrw $0, %xmm6, %eax ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm5 -; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 -; CHECK-NEXT: vucomiss %xmm5, %xmm0 +; CHECK-NEXT: vmovd %eax, %xmm6 +; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 +; CHECK-NEXT: vucomiss %xmm6, %xmm0 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -102,18 +102,18 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-33, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm5, %eax +; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrw $0, %xmm6, %eax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm5 -; CHECK-NEXT: vcvtph2ps %xmm5, %xmm7 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm5, %eax +; CHECK-NEXT: vmovd %eax, %xmm6 +; CHECK-NEXT: vcvtph2ps %xmm6, %xmm7 +; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrw $0, %xmm6, %eax ; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm5 -; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 -; CHECK-NEXT: vucomiss %xmm7, %xmm5 +; CHECK-NEXT: vmovd %eax, %xmm6 +; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 +; CHECK-NEXT: vucomiss %xmm7, %xmm6 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -147,12 +147,12 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm7 ; CHECK-NEXT: vcvtph2ps %xmm7, %xmm7 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpextrw $0, %xmm6, %eax +; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrw $0, %xmm5, %eax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vmovd %eax, %xmm6 -; CHECK-NEXT: vcvtph2ps %xmm6, %xmm6 -; CHECK-NEXT: vucomiss %xmm7, %xmm6 +; CHECK-NEXT: vmovd %eax, %xmm5 +; CHECK-NEXT: vcvtph2ps %xmm5, %xmm5 +; CHECK-NEXT: vucomiss %xmm7, %xmm5 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -254,7 +254,7 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm2 ; CHECK-NEXT: vcvtph2ps %xmm2, %xmm2 -; CHECK-NEXT: vucomiss %xmm2, %xmm5 +; CHECK-NEXT: vucomiss %xmm2, %xmm6 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl @@ -286,7 +286,7 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: 
vcvtph2ps %xmm0, %xmm0 ; CHECK-NEXT: kshiftrw $1, %k0, %k0 -; CHECK-NEXT: vucomiss %xmm0, %xmm6 +; CHECK-NEXT: vucomiss %xmm0, %xmm5 ; CHECK-NEXT: setnp %al ; CHECK-NEXT: sete %cl ; CHECK-NEXT: testb %al, %cl diff --git a/llvm/test/CodeGen/X86/pr59258.ll b/llvm/test/CodeGen/X86/pr59258.ll index fb2d219556632..61ddb24eaaf87 100644 --- a/llvm/test/CodeGen/X86/pr59258.ll +++ b/llvm/test/CodeGen/X86/pr59258.ll @@ -4,7 +4,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-LABEL: cvt_and_clamp2: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $120, %rsp +; CHECK-NEXT: subq $104, %rsp ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm1, %xmm0 @@ -21,7 +21,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __truncsfhf2@PLT @@ -62,13 +62,13 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: callq fmaxf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: movss (%rsp), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: callq fmaxf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT @@ -110,7 +110,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: callq fminf@PLT ; CHECK-NEXT: callq __truncsfhf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: movd (%rsp), %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -157,7 +157,7 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind { ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: addq $120, %rsp +; CHECK-NEXT: addq $104, %rsp ; CHECK-NEXT: retq %2 = fptrunc <8 x float> %0 to <8 x half> %3 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> zeroinitializer, <8 x half> %2) diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 7858d125b9da4..d2da4c0f3e86a 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -840,8 +840,8 @@ vector.ph: define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE2OR3-LABEL: test14: ; SSE2OR3: # %bb.0: # %vector.ph -; SSE2OR3-NEXT: pxor %xmm5, %xmm5 -; SSE2OR3-NEXT: movdqa %xmm0, %xmm6 +; SSE2OR3-NEXT: pxor %xmm6, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm0, %xmm5 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm7 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 ; 
SSE2OR3-NEXT: movdqa %xmm2, %xmm9 @@ -856,27 +856,27 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; SSE2OR3-NEXT: packuswb %xmm3, %xmm1 ; SSE2OR3-NEXT: psubb %xmm0, %xmm1 ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 -; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2OR3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE2OR3-NEXT: movdqa %xmm2, %xmm0 -; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE2OR3-NEXT: movdqa %xmm6, %xmm3 -; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE2OR3-NEXT: pxor %xmm5, %xmm7 -; SSE2OR3-NEXT: por %xmm5, %xmm6 -; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2OR3-NEXT: pxor %xmm5, %xmm8 -; SSE2OR3-NEXT: por %xmm5, %xmm3 +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE2OR3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; SSE2OR3-NEXT: movdqa %xmm5, %xmm3 +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2OR3-NEXT: pxor %xmm6, %xmm7 +; SSE2OR3-NEXT: por %xmm6, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2OR3-NEXT: pxor %xmm6, %xmm8 +; SSE2OR3-NEXT: por %xmm6, %xmm3 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE2OR3-NEXT: packssdw %xmm6, %xmm3 -; SSE2OR3-NEXT: pxor %xmm5, %xmm9 -; SSE2OR3-NEXT: por %xmm5, %xmm2 +; SSE2OR3-NEXT: packssdw %xmm5, %xmm3 +; SSE2OR3-NEXT: pxor %xmm6, %xmm9 +; SSE2OR3-NEXT: por %xmm6, %xmm2 ; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE2OR3-NEXT: pxor %xmm5, %xmm4 -; SSE2OR3-NEXT: por %xmm5, %xmm0 +; SSE2OR3-NEXT: pxor %xmm6, %xmm4 +; SSE2OR3-NEXT: por %xmm6, %xmm0 ; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE2OR3-NEXT: packssdw %xmm2, %xmm0 ; SSE2OR3-NEXT: packsswb %xmm3, %xmm0 @@ -1669,7 +1669,7 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; ; SSE41-LABEL: psubus_8i64_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pxor %xmm9, %xmm0 @@ -1677,8 +1677,8 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm7, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = 
xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535] @@ -1689,7 +1689,7 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm7, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: movapd %xmm8, %xmm4 @@ -1700,7 +1700,7 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: movdqa %xmm7, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: movapd %xmm8, %xmm3 @@ -1708,14 +1708,14 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; SSE41-NEXT: pxor %xmm1, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pand %xmm7, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 ; SSE41-NEXT: packusdw %xmm3, %xmm8 ; SSE41-NEXT: packusdw %xmm4, %xmm8 -; SSE41-NEXT: psubusw %xmm8, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: psubusw %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: psubus_8i64_max: diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index e747692412034..336aa216d19b1 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -452,80 +452,80 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %rcx -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; X64-NEXT: movq %xmm3, %rdi +; X64-NEXT: movq %xmm3, %rcx ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-NEXT: movq %xmm3, %rax ; X64-NEXT: cqto -; X64-NEXT: idivq %rdi +; X64-NEXT: idivq %rcx ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: pxor %xmm3, %xmm3 ; X64-NEXT: pcmpgtd %xmm4, %xmm3 ; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X64-NEXT: movq %xmm4, %r9 +; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: pxor %xmm5, %xmm5 ; X64-NEXT: pcmpgtd %xmm1, %xmm5 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; X64-NEXT: psllq $31, %xmm1 ; X64-NEXT: movq %xmm1, %rax ; X64-NEXT: cqto -; X64-NEXT: idivq %r9 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: idivq %rcx +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; X64-NEXT: movq %xmm4, %r11 ; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] ; X64-NEXT: movq %xmm4, %rax ; X64-NEXT: cqto ; X64-NEXT: idivq %r11 -; X64-NEXT: movq %rsi, %xmm4 ; X64-NEXT: movq %r8, %xmm5 -; X64-NEXT: pxor %xmm6, %xmm6 -; X64-NEXT: 
punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; X64-NEXT: pcmpeqd %xmm6, %xmm4 -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] -; X64-NEXT: pand %xmm4, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; X64-NEXT: movq %r10, %xmm6 ; X64-NEXT: pxor %xmm4, %xmm4 -; X64-NEXT: pcmpgtd %xmm2, %xmm4 +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; X64-NEXT: pcmpeqd %xmm4, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2] +; X64-NEXT: pand %xmm5, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtd %xmm2, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-NEXT: pxor %xmm2, %xmm2 ; X64-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-NEXT: movq %rcx, %xmm0 -; X64-NEXT: pxor %xmm4, %xmm2 -; X64-NEXT: movq %rdi, %xmm4 -; X64-NEXT: pandn %xmm2, %xmm5 -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; X64-NEXT: movdqa %xmm5, %xmm2 -; X64-NEXT: pandn %xmm0, %xmm2 -; X64-NEXT: pcmpeqd %xmm4, %xmm4 -; X64-NEXT: paddq %xmm4, %xmm0 -; X64-NEXT: pand %xmm5, %xmm0 -; X64-NEXT: por %xmm2, %xmm0 -; X64-NEXT: movq %r10, %xmm2 -; X64-NEXT: movq %rdx, %xmm5 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; X64-NEXT: pcmpeqd %xmm6, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,2] -; X64-NEXT: pand %xmm2, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; X64-NEXT: pxor %xmm3, %xmm3 -; X64-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-NEXT: movq %rsi, %xmm0 +; X64-NEXT: pxor %xmm5, %xmm2 +; X64-NEXT: movq %rdi, %xmm5 +; X64-NEXT: pandn %xmm2, %xmm6 +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; X64-NEXT: movdqa %xmm6, %xmm5 +; X64-NEXT: pandn %xmm0, %xmm5 +; X64-NEXT: pcmpeqd %xmm2, %xmm2 +; X64-NEXT: paddq %xmm2, %xmm0 +; X64-NEXT: pand %xmm6, %xmm0 +; X64-NEXT: por %xmm5, %xmm0 +; X64-NEXT: movq %r9, %xmm5 +; X64-NEXT: movq %rdx, %xmm6 +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; X64-NEXT: pcmpeqd %xmm4, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2] +; X64-NEXT: pand %xmm5, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtd %xmm3, %xmm5 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-NEXT: pcmpgtd %xmm1, %xmm6 -; X64-NEXT: pxor %xmm3, %xmm6 -; X64-NEXT: pandn %xmm6, %xmm5 -; X64-NEXT: movq %r9, %xmm1 -; X64-NEXT: movq %rax, %xmm2 -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-NEXT: movdqa %xmm5, %xmm2 -; X64-NEXT: pandn %xmm1, %xmm2 -; X64-NEXT: paddq %xmm4, %xmm1 -; X64-NEXT: pand %xmm5, %xmm1 -; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: pcmpgtd %xmm1, %xmm4 +; X64-NEXT: pxor %xmm5, %xmm4 +; X64-NEXT: pandn %xmm4, %xmm6 +; X64-NEXT: movq %rcx, %xmm1 +; X64-NEXT: movq %rax, %xmm3 +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X64-NEXT: movdqa %xmm6, %xmm3 +; X64-NEXT: pandn %xmm1, %xmm3 +; X64-NEXT: paddq %xmm2, %xmm1 +; X64-NEXT: pand %xmm6, %xmm1 +; X64-NEXT: por %xmm3, %xmm1 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index a308f85573c7f..371484e01556c 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -373,33 +373,32 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: subl $88, %esp ; X86-NEXT: movl 8(%ebp), %ecx ; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 20(%ebp), %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: sarl $31, %ebx 
-; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $31, %eax, %edx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: shldl $31, %ecx, %eax -; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll $31, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi +; X86-NEXT: pushl 16(%ebp) +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ecx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -407,12 +406,13 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: subl $1, %esi ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: testl %ecx, %ecx @@ -420,12 +420,12 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: xorb %al, %dl ; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 20(%ebp) ; X86-NEXT: pushl 16(%ebp) ; X86-NEXT: pushl %ecx -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl %eax @@ -440,14 +440,16 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sbbl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sbbl 
$0, %ecx ; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl $0, %ecx @@ -900,28 +902,27 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 40(%ebp), %edx -; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 24(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: leal (%ecx,%ecx), %esi +; X86-NEXT: leal (%ecx,%ecx), %edx ; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %esi, %ecx -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shldl $31, %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl 40(%ebp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %eax ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi -; X86-NEXT: pushl $0 ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %edx ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -929,7 +930,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 28(%ebp) -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -943,16 +943,16 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %bl -; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %bh ; X86-NEXT: xorb %bl, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -962,37 +962,37 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: orl %edi, %esi ; X86-NEXT: setne %bl ; X86-NEXT: testb %bh, %bl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: xorl %edi, %edi +; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: 
movl %edx, %esi ; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: cmovgel %edi, %ebx -; X86-NEXT: cmovgel %edi, %edx -; X86-NEXT: cmovgel %edi, %ecx +; X86-NEXT: cmovgel %ebx, %edx +; X86-NEXT: cmovgel %ebx, %ecx +; X86-NEXT: cmovgel %ebx, %edi ; X86-NEXT: movl $-1, %esi ; X86-NEXT: cmovgel %esi, %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: negl %esi ; X86-NEXT: movl $-1, %esi -; X86-NEXT: sbbl %ecx, %esi +; X86-NEXT: sbbl %edi, %esi ; X86-NEXT: movl $-1, %esi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: movl $-1, %edx -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: cmovgel %edi, %eax +; X86-NEXT: sbbl %ecx, %esi +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: cmovgel %ebx, %eax ; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovgel %edx, %ecx -; X86-NEXT: shldl $31, %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovgel %edx, %edi +; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax @@ -1000,12 +1000,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bl ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -1013,42 +1013,42 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: xorb %bl, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl %ecx, %esi ; X86-NEXT: setne %cl ; X86-NEXT: testb %bh, %cl -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %esi ; X86-NEXT: cmovgel %ecx, %edx ; X86-NEXT: cmovgel %ecx, %edi +; X86-NEXT: cmovgel %ecx, %esi ; X86-NEXT: movl $-1, %ebx ; X86-NEXT: cmovgel %ebx, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: negl %ecx ; X86-NEXT: movl $-1, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl $-1, %ecx ; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl $-1, %ecx ; 
X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmovgel %ecx, %eax -; X86-NEXT: cmovgel %ebx, %edi -; X86-NEXT: shldl $31, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovgel %ebx, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax @@ -1056,12 +1056,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bl ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -1069,40 +1069,40 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: xorb %bl, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: orl %ecx, %esi ; X86-NEXT: setne %cl ; X86-NEXT: testb %bh, %cl -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %eax ; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %esi ; X86-NEXT: cmovgel %ecx, %edx +; X86-NEXT: cmovgel %ecx, %edi ; X86-NEXT: cmovgel %ecx, %ebx -; X86-NEXT: movl $-1, %edi -; X86-NEXT: cmovgel %edi, %eax +; X86-NEXT: movl $-1, %esi +; X86-NEXT: cmovgel %esi, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: negl %ecx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl $-1, %ecx -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: cmovgel %ecx, %eax -; X86-NEXT: cmovgel %edi, %ebx +; X86-NEXT: cmovgel %esi, %ebx ; X86-NEXT: shldl $31, %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll index fbdec48c978f8..213b2b018d0ad 100644 --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -508,43 +508,43 @@ define void @test8(i1 %c, ptr %dst.addr, <6 x i32> %src1,<6 x i32> %src2) 
nounwi ; ATHLON-NEXT: pushl %edi ; ATHLON-NEXT: pushl %esi ; ATHLON-NEXT: testb $1, {{[0-9]+}}(%esp) -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax -; ATHLON-NEXT: cmovnel %ecx, %eax -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx -; ATHLON-NEXT: cmovnel %edx, %ecx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %esi -; ATHLON-NEXT: cmovnel %edx, %esi -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edi -; ATHLON-NEXT: cmovnel %edx, %edi -; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebx -; ATHLON-NEXT: cmovnel %edx, %ebx +; ATHLON-NEXT: cmovnel %eax, %ebx +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edi +; ATHLON-NEXT: cmovnel %eax, %edi +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %esi +; ATHLON-NEXT: cmovnel %eax, %esi +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %edx +; ATHLON-NEXT: cmovnel %eax, %edx +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ecx +; ATHLON-NEXT: cmovnel %eax, %ecx ; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %ebp -; ATHLON-NEXT: cmovnel %edx, %ebp -; ATHLON-NEXT: movl (%eax), %eax -; ATHLON-NEXT: movl (%ecx), %ecx -; ATHLON-NEXT: movl (%esi), %edx -; ATHLON-NEXT: movl (%edi), %esi -; ATHLON-NEXT: movl (%ebx), %ebx -; ATHLON-NEXT: movl (%ebp), %edi -; ATHLON-NEXT: decl %eax -; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %ebp -; ATHLON-NEXT: movl %eax, 20(%ebp) -; ATHLON-NEXT: decl %ecx -; ATHLON-NEXT: movl %ecx, 16(%ebp) -; ATHLON-NEXT: decl %edx -; ATHLON-NEXT: movl %edx, 12(%ebp) -; ATHLON-NEXT: decl %esi -; ATHLON-NEXT: movl %esi, 8(%ebp) +; ATHLON-NEXT: leal {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: cmovnel %ebp, %eax +; ATHLON-NEXT: movl (%ebx), %ebp +; ATHLON-NEXT: movl (%edi), %ebx +; ATHLON-NEXT: movl (%esi), %edi +; ATHLON-NEXT: movl (%edx), %esi +; ATHLON-NEXT: movl (%ecx), %edx +; ATHLON-NEXT: movl (%eax), %ecx +; ATHLON-NEXT: decl %ebp +; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax +; ATHLON-NEXT: movl %ebp, 20(%eax) ; ATHLON-NEXT: decl %ebx -; ATHLON-NEXT: movl %ebx, 4(%ebp) +; ATHLON-NEXT: movl %ebx, 16(%eax) ; ATHLON-NEXT: decl %edi -; ATHLON-NEXT: movl %edi, (%ebp) +; ATHLON-NEXT: movl %edi, 12(%eax) +; ATHLON-NEXT: decl %esi +; ATHLON-NEXT: movl %esi, 8(%eax) +; ATHLON-NEXT: decl %edx +; ATHLON-NEXT: movl %edx, 4(%eax) +; ATHLON-NEXT: decl %ecx +; ATHLON-NEXT: movl %ecx, (%eax) ; ATHLON-NEXT: popl %esi ; ATHLON-NEXT: popl %edi ; ATHLON-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll index ae0b010a1e594..65e3c1f0633d7 100644 --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -571,50 +571,50 @@ define <8 x i32> @PR63946(<8 x i32> %a0, <8 x i32> %b0) nounwind { ; SSE-LABEL: PR63946: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,2,3,0] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,2,3,0] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,2,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,2,3,0] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,0,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,0,1,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[3,0,1,2] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; 
SSE-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,0,1,2] +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm11 ; SSE-NEXT: pcmpeqd %xmm4, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pcmpeqd %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm15 ; SSE-NEXT: pcmpeqd %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pcmpeqd %xmm4, %xmm14 ; SSE-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE-NEXT: pcmpeqd %xmm4, %xmm7 +; SSE-NEXT: pcmpeqd %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pcmpeqd %xmm4, %xmm10 ; SSE-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE-NEXT: pcmpeqd %xmm13, %xmm12 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: pcmpeqd %xmm0, %xmm13 -; SSE-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: pcmpeqd %xmm13, %xmm10 +; SSE-NEXT: pcmpeqd %xmm13, %xmm0 +; SSE-NEXT: por %xmm15, %xmm2 ; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: por %xmm15, %xmm6 -; SSE-NEXT: pcmpeqd %xmm0, %xmm8 +; SSE-NEXT: pcmpeqd %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: por %xmm14, %xmm7 +; SSE-NEXT: pcmpeqd %xmm13, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: pcmpeqd %xmm13, %xmm5 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 ; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: por %xmm13, %xmm5 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: packssdw %xmm9, %xmm5 -; SSE-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE-NEXT: packssdw %xmm10, %xmm1 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: packssdw %xmm8, %xmm5 +; SSE-NEXT: pcmpeqd %xmm13, %xmm1 +; SSE-NEXT: packssdw %xmm6, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll index 3cb680396b6ba..f6d73b1fbc6e7 100644 --- a/llvm/test/CodeGen/X86/shift-and.ll +++ b/llvm/test/CodeGen/X86/shift-and.ll @@ -168,22 +168,20 @@ define void @t5ptr(i64 %t, ptr %ptr) nounwind { define i64 @t6(i64 %key, ptr nocapture %val) nounwind { ; X32-LABEL: t6: ; X32: # %bb.0: -; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shrdl $3, %eax, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: shrl $3, %edi -; X32-NEXT: movl (%ecx), %eax -; X32-NEXT: movl 4(%ecx), %edx +; X32-NEXT: shrdl $3, %eax, %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $3, %esi +; X32-NEXT: movl (%edx), %eax +; X32-NEXT: movl 4(%edx), %edx ; X32-NEXT: addl $-1, %eax ; X32-NEXT: adcl $-1, %edx -; X32-NEXT: andl %esi, %eax -; X32-NEXT: andl %edi, %edx +; X32-NEXT: andl %ecx, %eax +; X32-NEXT: andl %esi, %edx ; X32-NEXT: popl %esi -; X32-NEXT: popl %edi ; X32-NEXT: retl ; ; 
X64-LABEL: t6: diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index aefc4df882c7d..1fe8d834dbcdd 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -160,12 +160,12 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: subl $32, %esp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax +; i686-NEXT: movl {{[0-9]+}}(%esp), %edx ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -177,29 +177,29 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: andb $15, %cl ; i686-NEXT: negb %cl ; i686-NEXT: movsbl %cl, %ebp -; i686-NEXT: movl 24(%esp,%ebp), %edx -; i686-NEXT: movl %edx, %ebx +; i686-NEXT: movl 24(%esp,%ebp), %ebx +; i686-NEXT: movl %ebx, %edx ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shll %cl, %ebx +; i686-NEXT: shll %cl, %edx ; i686-NEXT: notb %cl ; i686-NEXT: movl 20(%esp,%ebp), %edi ; i686-NEXT: movl %edi, %esi ; i686-NEXT: shrl %esi ; i686-NEXT: shrl %cl, %esi -; i686-NEXT: orl %ebx, %esi -; i686-NEXT: movl 16(%esp,%ebp), %ebx +; i686-NEXT: orl %edx, %esi +; i686-NEXT: movl 16(%esp,%ebp), %edx ; i686-NEXT: movl 28(%esp,%ebp), %ebp ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shldl %cl, %edx, %ebp +; i686-NEXT: shldl %cl, %ebx, %ebp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl %ebp, 12(%ecx) -; i686-NEXT: movl %ebx, %edx +; i686-NEXT: movl %edx, %ebx ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shll %cl, %edx -; i686-NEXT: shldl %cl, %ebx, %edi +; i686-NEXT: shll %cl, %ebx +; i686-NEXT: shldl %cl, %edx, %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl %edi, 4(%eax) -; i686-NEXT: movl %edx, (%eax) +; i686-NEXT: movl %ebx, (%eax) ; i686-NEXT: movl %esi, 8(%eax) ; i686-NEXT: addl $32, %esp ; i686-NEXT: popl %esi @@ -407,8 +407,8 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $92, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; i686-NEXT: movl {{[0-9]+}}(%esp), %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %edx ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -435,76 +435,75 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebp, %ebx +; i686-NEXT: movl %edi, %ebx ; i686-NEXT: andl $7, %ebx -; i686-NEXT: shrl $3, %ebp -; i686-NEXT: andl $15, %ebp -; i686-NEXT: movl 32(%esp,%ebp), %eax +; i686-NEXT: shrl $3, %edi +; i686-NEXT: andl $15, %edi +; i686-NEXT: movl 32(%esp,%edi), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl %ebx, %ecx ; i686-NEXT: shrl %cl, %eax ; i686-NEXT: movl %ebx, %ecx ; i686-NEXT: notl %ecx -; i686-NEXT: movl 36(%esp,%ebp), %edx +; i686-NEXT: movl 36(%esp,%edi), %edx ; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: addl %edx, %edx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: 
shll %cl, %edx ; i686-NEXT: orl %eax, %edx ; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %edi, %ecx -; i686-NEXT: movl %edi, %edx +; i686-NEXT: movl %ebp, %eax +; i686-NEXT: movl %ebp, %edx ; i686-NEXT: andl $7, %edx -; i686-NEXT: shrl $3, %ecx -; i686-NEXT: andl $15, %ecx -; i686-NEXT: movl 64(%esp,%ecx), %esi -; i686-NEXT: movl %ecx, %edi -; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shrl $3, %eax +; i686-NEXT: andl $15, %eax +; i686-NEXT: movl 64(%esp,%eax), %ebp +; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %eax, (%esp) # 4-byte Spill ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrl %cl, %esi +; i686-NEXT: shrl %cl, %ebp ; i686-NEXT: movl %edx, %ecx ; i686-NEXT: notl %ecx -; i686-NEXT: movl 68(%esp,%edi), %eax -; i686-NEXT: leal (%eax,%eax), %edi +; i686-NEXT: movl 68(%esp,%eax), %esi +; i686-NEXT: leal (%esi,%esi), %eax ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: orl %esi, %edi -; i686-NEXT: movl 28(%esp,%ebp), %ecx +; i686-NEXT: shll %cl, %eax +; i686-NEXT: orl %ebp, %eax +; i686-NEXT: movl 28(%esp,%edi), %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 40(%esp,%ebp), %esi +; i686-NEXT: movl 40(%esp,%edi), %edi ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: movl (%esp), %ecx # 4-byte Reload ; i686-NEXT: movl 60(%esp,%ecx), %ebp ; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl 72(%esp,%ecx), %ebp ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrdl %cl, %ebp, %eax -; i686-NEXT: movl %eax, (%esp) # 4-byte Spill +; i686-NEXT: shrdl %cl, %ebp, %esi +; i686-NEXT: movl %esi, (%esp) # 4-byte Spill ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: sarl %cl, %esi +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: sarl %cl, %edi ; i686-NEXT: movl %edx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; i686-NEXT: shrdl %cl, %eax, %ebx +; i686-NEXT: shrdl %cl, %esi, %ebx ; i686-NEXT: movl %edx, %ecx ; i686-NEXT: sarl %cl, %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl %ebp, 28(%eax) -; i686-NEXT: movl (%esp), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 24(%eax) -; i686-NEXT: movl %ebx, 16(%eax) -; i686-NEXT: movl %esi, 12(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 8(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, (%eax) -; i686-NEXT: movl %edi, 20(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 4(%eax) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movl %ebp, 28(%ecx) +; i686-NEXT: movl (%esp), %edx # 4-byte Reload +; i686-NEXT: movl %edx, 24(%ecx) +; i686-NEXT: movl %ebx, 16(%ecx) +; i686-NEXT: movl %edi, 12(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; i686-NEXT: movl %edx, 
8(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; i686-NEXT: movl %edx, (%ecx) +; i686-NEXT: movl %eax, 20(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, 4(%ecx) ; i686-NEXT: addl $92, %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll index 2f4071530382b..0e4e706669300 100644 --- a/llvm/test/CodeGen/X86/shift-i256.ll +++ b/llvm/test/CodeGen/X86/shift-i256.ll @@ -5,6 +5,191 @@ ; CHECK-LABEL: shift1 define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { +; CHECK-LABEL: shift1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: subl $92, %esp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: sarl $31, %esi +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: andb $7, %al +; CHECK-NEXT: shrb $3, %cl +; CHECK-NEXT: movzbl %cl, %ebp +; CHECK-NEXT: movl 32(%esp,%ebp), %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl %cl, %esi +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: notb %dl +; CHECK-NEXT: movl 36(%esp,%ebp), %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: leal (%ecx,%ecx), %edi +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shll %cl, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 40(%esp,%ebp), %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl %cl, %esi +; CHECK-NEXT: movl 44(%esp,%ebp), %ecx +; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill +; CHECK-NEXT: leal (%ecx,%ecx), %edi +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shll %cl, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 48(%esp,%ebp), %ebx +; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl %cl, %ebx +; CHECK-NEXT: movl 52(%esp,%ebp), %edi +; CHECK-NEXT: leal (%edi,%edi), %esi +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shll %cl, %esi +; CHECK-NEXT: orl %ebx, %esi +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; CHECK-NEXT: 
movl %eax, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill +; CHECK-NEXT: movl 28(%esp,%ebp), %edx +; CHECK-NEXT: movl 56(%esp,%ebp), %ebx +; CHECK-NEXT: shrdl %cl, %ebx, %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %ebp, %edx +; CHECK-NEXT: sarl %cl, %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ebx, 28(%eax) +; CHECK-NEXT: movl %edi, 24(%eax) +; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 16(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 8(%eax) +; CHECK-NEXT: movl %edx, (%eax) +; CHECK-NEXT: movl %esi, 20(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 12(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 4(%eax) +; CHECK-NEXT: addl $92, %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl +; +; CHECK-X64-O0-LABEL: shift1: +; CHECK-X64-O0: # %bb.0: # %entry +; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-O0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: sarq $63, %rcx +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movb %r8b, %dl +; CHECK-X64-O0-NEXT: movb %dl, %cl +; CHECK-X64-O0-NEXT: andb $7, %cl +; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-X64-O0-NEXT: shrb $3, %dl +; CHECK-X64-O0-NEXT: movzbl %dl, %edx +; CHECK-X64-O0-NEXT: movl %edx, %edi +; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi), %rdx +; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi), %r8 +; CHECK-X64-O0-NEXT: movq %r8, %r9 +; CHECK-X64-O0-NEXT: shrq %cl, %r9 +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: notb %cl +; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi), %rsi +; CHECK-X64-O0-NEXT: movq %rsi, %r10 +; CHECK-X64-O0-NEXT: addq %r10, %r10 +; CHECK-X64-O0-NEXT: shlq %cl, %r10 +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: orq %r10, %r9 +; CHECK-X64-O0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-X64-O0-NEXT: movq -40(%rsp,%rdi), %rdi +; CHECK-X64-O0-NEXT: shrdq %cl, %rdi, %rsi +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: shrdq %cl, %r8, %rdx +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: sarq %cl, %rdi +; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-X64-O0-NEXT: movq %rdi, 24(%rax) +; CHECK-X64-O0-NEXT: movq %rsi, 16(%rax) +; CHECK-X64-O0-NEXT: movq %rdx, (%rax) +; CHECK-X64-O0-NEXT: movq %rcx, 8(%rax) +; CHECK-X64-O0-NEXT: retq +; +; CHECK-X64-O2-LABEL: shift1: +; CHECK-X64-O2: # %bb.0: # %entry +; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; 
CHECK-X64-O2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: sarq $63, %rcx +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movl %r8d, %eax +; CHECK-X64-O2-NEXT: andb $7, %al +; CHECK-X64-O2-NEXT: shrb $3, %r8b +; CHECK-X64-O2-NEXT: movzbl %r8b, %edx +; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx), %rsi +; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx), %rdi +; CHECK-X64-O2-NEXT: movq %rdi, %r8 +; CHECK-X64-O2-NEXT: movl %eax, %ecx +; CHECK-X64-O2-NEXT: shrq %cl, %r8 +; CHECK-X64-O2-NEXT: notb %cl +; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx), %r10 +; CHECK-X64-O2-NEXT: leaq (%r10,%r10), %r11 +; CHECK-X64-O2-NEXT: shlq %cl, %r11 +; CHECK-X64-O2-NEXT: orq %r8, %r11 +; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rdx +; CHECK-X64-O2-NEXT: movl %eax, %ecx +; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %r10 +; CHECK-X64-O2-NEXT: shrdq %cl, %rdi, %rsi +; CHECK-X64-O2-NEXT: sarq %cl, %rdx +; CHECK-X64-O2-NEXT: movq %rdx, 24(%r9) +; CHECK-X64-O2-NEXT: movq %r10, 16(%r9) +; CHECK-X64-O2-NEXT: movq %rsi, (%r9) +; CHECK-X64-O2-NEXT: movq %r11, 8(%r9) +; CHECK-X64-O2-NEXT: retq entry: %0 = ashr i256 %x, %a store i256 %0, ptr %r @@ -47,12 +232,12 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: shll %cl, %edx ; CHECK-NEXT: notb %cl ; CHECK-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; CHECK-NEXT: movl 64(%esp,%eax), %edi -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: shrl %edi -; CHECK-NEXT: shrl %cl, %edi -; CHECK-NEXT: orl %edx, %edi -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 64(%esp,%eax), %ebp +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: shrl %ebp +; CHECK-NEXT: shrl %cl, %ebp +; CHECK-NEXT: orl %edx, %ebp +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl 76(%esp,%eax), %edx ; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movb %ch, %cl @@ -67,8 +252,8 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movb %ch, %cl ; CHECK-NEXT: shll %cl, %esi -; CHECK-NEXT: movl 80(%esp,%eax), %ebp -; CHECK-NEXT: movl %ebp, %edx +; CHECK-NEXT: movl 80(%esp,%eax), %edi +; CHECK-NEXT: movl %edi, %edx ; CHECK-NEXT: shrl %edx ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload ; CHECK-NEXT: shrl %cl, %edx @@ -77,21 +262,21 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; CHECK-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; CHECK-NEXT: shldl %cl, %esi, %ebp -; CHECK-NEXT: movl 60(%esp,%eax), %edi +; CHECK-NEXT: shldl %cl, %esi, %edi +; CHECK-NEXT: movl 60(%esp,%eax), %ebp ; CHECK-NEXT: movl 88(%esp,%eax), %esi ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; CHECK-NEXT: shldl %cl, %eax, %esi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %esi, 28(%eax) -; CHECK-NEXT: movl %ebp, 20(%eax) +; CHECK-NEXT: movl %edi, 20(%eax) ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; CHECK-NEXT: movl %esi, 12(%eax) -; CHECK-NEXT: movl %edi, %esi +; CHECK-NEXT: movl %ebp, %esi ; CHECK-NEXT: shll %cl, %esi -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; CHECK-NEXT: shldl %cl, %edi, 
%ebp -; CHECK-NEXT: movl %ebp, 4(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: shldl %cl, %ebp, %edi +; CHECK-NEXT: movl %edi, 4(%eax) ; CHECK-NEXT: movl %esi, (%eax) ; CHECK-NEXT: movl %edx, 24(%eax) ; CHECK-NEXT: movl %ebx, 16(%eax) diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index cf41a91737d88..524ecf2aece7e 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1989,10 +1989,10 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: pxor %xmm4, %xmm4 ; X86-SSE-NEXT: movdqa %xmm3, %xmm2 ; X86-SSE-NEXT: pextrw $7, %xmm3, %eax -; X86-SSE-NEXT: pextrw $4, %xmm3, %esi -; X86-SSE-NEXT: pextrw $0, %xmm3, %edi -; X86-SSE-NEXT: pextrw $1, %xmm3, %ebx -; X86-SSE-NEXT: pextrw $3, %xmm3, %ebp +; X86-SSE-NEXT: pextrw $4, %xmm3, %edi +; X86-SSE-NEXT: pextrw $0, %xmm3, %ebp +; X86-SSE-NEXT: pextrw $1, %xmm3, %esi +; X86-SSE-NEXT: pextrw $3, %xmm3, %ebx ; X86-SSE-NEXT: movdqa %xmm3, %xmm5 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] @@ -2009,10 +2009,10 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm4 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X86-SSE-NEXT: movl %esi, %eax +; X86-SSE-NEXT: movl %edi, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: divl 16(%esi) +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE-NEXT: divl 16(%edi) ; X86-SSE-NEXT: movd %edx, %xmm3 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; X86-SSE-NEXT: movd %xmm2, %eax @@ -2023,20 +2023,20 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: movd %edx, %xmm1 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X86-SSE-NEXT: movl %edi, %eax +; X86-SSE-NEXT: movl %ebp, %eax ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl (%esi) +; X86-SSE-NEXT: divl (%edi) ; X86-SSE-NEXT: movd %edx, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-SSE-NEXT: movd %xmm2, %ecx -; X86-SSE-NEXT: movl %ebx, %eax +; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm2 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X86-SSE-NEXT: movd %xmm2, %ecx -; X86-SSE-NEXT: movl %ebp, %eax +; X86-SSE-NEXT: movl %ebx, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm2 @@ -2051,24 +2051,24 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl 32(%esi) -; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 +; X86-SSE-NEXT: divl 32(%edi) 
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X86-SSE-NEXT: movl %eax, (%eax) ; X86-SSE-NEXT: movdqa %xmm3, (%eax) -; X86-SSE-NEXT: movdqa %xmm1, (%eax) +; X86-SSE-NEXT: movdqa %xmm0, (%eax) ; X86-SSE-NEXT: addl $4, %esp ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %edi @@ -2210,10 +2210,10 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-SSE-NEXT: pxor %xmm4, %xmm4 ; X64-SSE-NEXT: movdqa %xmm3, %xmm2 ; X64-SSE-NEXT: pextrw $7, %xmm3, %eax -; X64-SSE-NEXT: pextrw $4, %xmm3, %edi -; X64-SSE-NEXT: pextrw $0, %xmm3, %r8d -; X64-SSE-NEXT: pextrw $1, %xmm3, %r9d -; X64-SSE-NEXT: pextrw $3, %xmm3, %r10d +; X64-SSE-NEXT: pextrw $4, %xmm3, %r8d +; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d +; X64-SSE-NEXT: pextrw $1, %xmm3, %edi +; X64-SSE-NEXT: pextrw $3, %xmm3, %r9d ; X64-SSE-NEXT: movdqa %xmm3, %xmm5 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] @@ -2230,33 +2230,33 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind { ; X64-SSE-NEXT: divl %r11d ; X64-SSE-NEXT: movd %edx, %xmm4 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X64-SSE-NEXT: movl %edi, %eax +; X64-SSE-NEXT: movl %r8d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 16(%rsi) ; X64-SSE-NEXT: movd %edx, %xmm3 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; X64-SSE-NEXT: movd %xmm2, %eax ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X64-SSE-NEXT: movd %xmm1, %edi +; X64-SSE-NEXT: movd %xmm1, %r8d ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %edi +; X64-SSE-NEXT: divl %r8d ; X64-SSE-NEXT: movd %edx, %xmm1 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; X64-SSE-NEXT: movl %r8d, %eax +; X64-SSE-NEXT: movl %r10d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl (%rsi) ; X64-SSE-NEXT: movd %edx, %xmm1 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X64-SSE-NEXT: movd %xmm2, %edi -; X64-SSE-NEXT: movl %r9d, %eax +; X64-SSE-NEXT: movd %xmm2, %r8d +; X64-SSE-NEXT: movl %edi, %eax ; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %edi +; X64-SSE-NEXT: divl %r8d ; X64-SSE-NEXT: movd %edx, %xmm2 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; X64-SSE-NEXT: movd %xmm2, %edi -; X64-SSE-NEXT: movl %r10d, %eax +; X64-SSE-NEXT: movl %r9d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi ; X64-SSE-NEXT: movd %edx, %xmm2 diff --git a/llvm/test/CodeGen/X86/smax.ll 
b/llvm/test/CodeGen/X86/smax.ll index d6906b573981a..2d59422953eb3 100644 --- a/llvm/test/CodeGen/X86/smax.ll +++ b/llvm/test/CodeGen/X86/smax.ll @@ -385,26 +385,26 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: cmovgl %eax, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovgl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmovgl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 24(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 4(%edx) -; X86-NEXT: movl %ebp, (%edx) -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %eax, 16(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -721,17 +721,17 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shrdl $28, %edi, %ecx ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: cmovll %edx, %ecx -; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: cmovll %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %ecx, (%eax) diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll index 2b059557cdfb5..bde61d5738ed5 100644 --- a/llvm/test/CodeGen/X86/smin.ll +++ b/llvm/test/CodeGen/X86/smin.ll @@ -386,26 +386,26 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: cmovll %eax, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovll %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 24(%ecx) ; X86-NEXT: movl 
(%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 4(%edx) -; X86-NEXT: movl %ebp, (%edx) -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %eax, 16(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -722,17 +722,17 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shrdl $28, %edi, %ecx ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: cmovll %edx, %ecx -; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: cmovll %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %ecx, (%eax) diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll index fbdb6e703fefd..0e17af441d649 100644 --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -196,110 +196,81 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: andl $1, %ebp +; X86-NEXT: negl %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl $1, %eax -; X86-NEXT: negl %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: setb %bl ; X86-NEXT: addl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: movzbl %bl, %esi ; X86-NEXT: adcl %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edx, %ebp +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb %bl +; X86-NEXT: setb %cl ; X86-NEXT: addl %eax, %edi -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %esi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: setb %al -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -308,103 +279,127 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; 
X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -414,248 +409,248 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %eax, %esi +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %esi +; X86-NEXT: addl %ebp, %edi ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ebp, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: adcl %eax, %esi ; X86-NEXT: setb %al -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %al, %ebp -; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movzbl %cl, %ebx -; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movzbl %bl, %ebx +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: adcl $0, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl %edx, %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %eax, %ebp ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: setb %bl ; X86-NEXT: addl %eax, %ecx -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: adcl %edx, %edi +; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: adcl %edx, %esi ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %esi ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: setb %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: addl %edi, %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded 
Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edx +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl %edi, %edx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: setb %cl -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb %al +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb %cl -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %al +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: movzbl %al, %esi +; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: setb %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %eax, %eax -; X86-NEXT: adcl %edx, %ebx +; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -663,113 +658,111 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: addl %edx, %eax ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: setb %al -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movzbl %al, %edi -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movzbl %al, %ebp +; X86-NEXT: adcl %esi, %ebp ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl %esi, %ebx ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: setb %bl +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, %ecx +; 
X86-NEXT: movzbl %bl, %ebx +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %edi, %esi +; X86-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %ebp, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl %esi, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %esi +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, %edi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ebp ; X86-NEXT: xorl %edx, %eax -; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl %ebp, %eax ; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: xorl %edx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -778,13 +771,12 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: xorl %edx, %ebp -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: movl %edi, %esi ; X86-NEXT: xorl %edx, %esi +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: xorl %edx, %edi ; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %edi, %edx ; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %ebp, %edx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl %edi, %ecx @@ -828,170 +820,169 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind { ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %r8, %r13 -; X64-NEXT: movq %rcx, %r10 -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %r9, %r15 +; 
X64-NEXT: movq %rcx, %r9 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; X64-NEXT: andl $1, %r11d ; X64-NEXT: negq %r11 -; X64-NEXT: andl $1, %r10d -; X64-NEXT: negq %r10 -; X64-NEXT: movq %r10, %rax +; X64-NEXT: andl $1, %r9d +; X64-NEXT: negq %r9 +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: addq %rdx, %rbp +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rax, %rdi -; X64-NEXT: adcq %rdx, %r8 -; X64-NEXT: setb %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: addq %rax, %r8 +; X64-NEXT: addq %rax, %rbp ; X64-NEXT: adcq %rdx, %rcx -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: setb %sil +; X64-NEXT: movzbl %sil, %edi +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: adcq %rdx, %rdi +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r14, %rbp -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r10, %r13 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbp, %rax +; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r13, %rsi -; X64-NEXT: setb %bpl -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: adcq %rbx, %rsi +; X64-NEXT: setb %r8b +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movzbl %bpl, %edx -; X64-NEXT: adcq %rdx, %r14 -; X64-NEXT: addq %r12, %rax -; X64-NEXT: movq %r12, %r9 +; X64-NEXT: movzbl %r8b, %edx +; X64-NEXT: adcq %rdx, %rbx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: adcq %rdi, %r14 -; X64-NEXT: adcq $0, %r8 +; X64-NEXT: adcq %rbp, %rbx ; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %r13, %r15 +; X64-NEXT: addq %r13, %r14 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %rbx, %r15 +; X64-NEXT: addq %r15, %r14 ; X64-NEXT: adcq %r13, %rbp ; X64-NEXT: setb %al -; X64-NEXT: addq 
%rdi, %rbp +; X64-NEXT: addq %r8, %rbp ; X64-NEXT: movzbl %al, %r12d ; X64-NEXT: adcq %rdx, %r12 -; X64-NEXT: addq %rbx, %rsi +; X64-NEXT: addq %r15, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %r15 +; X64-NEXT: adcq %rbx, %r14 ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: adcq $0, %r12 -; X64-NEXT: addq %r8, %rbp -; X64-NEXT: adcq %rcx, %r12 +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: adcq %rdi, %r12 ; X64-NEXT: setb %cl -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: addq %rax, %r8 -; X64-NEXT: adcq %rdx, %r14 -; X64-NEXT: setb %dil -; X64-NEXT: addq %rax, %r14 -; X64-NEXT: movzbl %dil, %esi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: addq %rax, %r10 +; X64-NEXT: adcq %rdx, %rdi +; X64-NEXT: setb %bl +; X64-NEXT: addq %rax, %rdi +; X64-NEXT: movzbl %bl, %esi ; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: addq %rax, %rbp -; X64-NEXT: adcq %r12, %r8 +; X64-NEXT: adcq %r12, %r10 ; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: adcq %rax, %rdi ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: movq %rsi, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: addq %rax, %rdi +; X64-NEXT: addq %rax, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %r9, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: addq %rbx, %r8 ; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: setb %al ; X64-NEXT: addq %rsi, %rcx ; X64-NEXT: movzbl %al, %esi ; X64-NEXT: adcq %rdx, %rsi -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: imulq %r11 -; X64-NEXT: movq %r9, %r11 +; X64-NEXT: movq %rbx, %r11 ; X64-NEXT: addq %rax, %r11 -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %r8, %r12 ; X64-NEXT: adcq %rdx, %r12 ; X64-NEXT: addq %rcx, %r11 ; X64-NEXT: adcq %rsi, %r12 -; X64-NEXT: movq %rbx, %r10 -; X64-NEXT: addq %r13, %r10 +; X64-NEXT: movq %r15, %r9 +; X64-NEXT: addq %r13, %r9 ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: adcq %rsi, %r13 -; X64-NEXT: setb %r9b +; X64-NEXT: setb %bl ; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: movzbl %r9b, %ecx +; X64-NEXT: movzbl %bl, %ecx ; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: adcq %r10, %rdx +; X64-NEXT: addq %r15, %rax +; X64-NEXT: adcq %r9, %rdx ; X64-NEXT: addq %r13, %rax ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; X64-NEXT: adcq %r8, %r9 ; X64-NEXT: adcq %r11, %rax ; X64-NEXT: adcq %r12, %rdx -; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: adcq %r8, %r10 -; X64-NEXT: adcq %r14, %rax +; X64-NEXT: addq %rbp, %r15 +; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: adcq %rdi, %rax ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq 
%r14, %rcx ; X64-NEXT: sarq $63, %rcx ; X64-NEXT: xorq %rcx, %rdx -; X64-NEXT: xorq %rcx, %r10 -; X64-NEXT: orq %rdx, %r10 +; X64-NEXT: xorq %rcx, %r9 +; X64-NEXT: orq %rdx, %r9 ; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: xorq %rbx, %rcx +; X64-NEXT: xorq %r15, %rcx ; X64-NEXT: orq %rax, %rcx -; X64-NEXT: orq %r10, %rcx +; X64-NEXT: orq %r9, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movl %eax, %esi ; X64-NEXT: andl $1, %esi ; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: negq %rdx -; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: xorq %rdx, %r14 ; X64-NEXT: xorq %rax, %rdx -; X64-NEXT: orq %r15, %rdx +; X64-NEXT: orq %r14, %rdx ; X64-NEXT: orq %rcx, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index 8c2b945d6a8ce..ce56283df6010 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -56,28 +56,28 @@ define i64 @func2(i64 %x, i64 %y) { ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ebp, %edi +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull %ebx, %edi ; X86-NEXT: addl %edi, %edx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: subl %ecx, %edi -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: cmovsl %edi, %edx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovsl %ecx, %edx ; X86-NEXT: shldl $30, %eax, %edx ; X86-NEXT: shldl $30, %esi, %eax diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll index 75fc1d34fa1cb..85c966c447fad 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -60,61 +60,61 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %edi, %eax -; X86-NEXT: imull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: 
mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: subl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: sbbl $0, %ebp -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebx, %ebp -; X86-NEXT: cmovnsl %edx, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: cmovnsl %esi, %ebp +; X86-NEXT: cmovnsl %edx, %edi +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ebp, %edx ; X86-NEXT: sbbl $0, %edx ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: cmovnsl %ebp, %edx -; X86-NEXT: cmovnsl %esi, %ecx +; X86-NEXT: cmovnsl %edi, %ecx ; X86-NEXT: testl %edx, %edx -; X86-NEXT: setg %bl -; X86-NEXT: sete %bh +; X86-NEXT: setg %ah +; X86-NEXT: sete (%esp) # 1-byte Folded Spill ; X86-NEXT: cmpl $2, %ecx ; X86-NEXT: setae %al -; X86-NEXT: andb %bh, %al -; X86-NEXT: orb %bl, %al -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NEXT: shrdl $2, %edi, %ebx -; X86-NEXT: shrdl $2, %ecx, %edi +; X86-NEXT: andb (%esp), %al # 1-byte Folded Reload +; X86-NEXT: orb %ah, %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shrdl $2, %ebx, %ebp +; X86-NEXT: shrdl $2, %ecx, %ebx ; X86-NEXT: testb %al, %al ; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: cmovel %ebx, %esi ; X86-NEXT: movl $-1, %edi -; X86-NEXT: cmovel %ebx, %edi +; X86-NEXT: cmovel %ebp, %edi ; X86-NEXT: cmpl $-1, %edx ; X86-NEXT: setl %dl ; X86-NEXT: sete %al @@ -126,7 +126,7 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: cmovel %edi, %eax ; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 ; X86-NEXT: cmovel %esi, %edx -; X86-NEXT: addl $4, %esp +; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -376,11 +376,12 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: imull %edx, %ebx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi @@ -398,44 +399,45 @@ define i64 @func5(i64 %x, i64 %y) { ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull 
%edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %eax, %esi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X86-NEXT: adcl %edi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: xorl %ebx, %edx +; X86-NEXT: xorl %eax, %ebx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF +; X86-NEXT: orl %edx, %ebx ; X86-NEXT: notl %ecx ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: cmovel %ebp, %esi +; X86-NEXT: cmovel %esi, %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: addl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi @@ -627,32 +629,31 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: subl %esi, %ecx -; X86-NEXT: movl %edi, %esi +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %edi, %esi +; X86-NEXT: cmovnsl %ebx, %esi ; X86-NEXT: cmovsl %ecx, %edx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index 3c4bb043800b7..abab313f4b12e 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ 
b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -89,183 +89,180 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 
4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edi, %edx +; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl %eax, %edx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %esi +; X86-NEXT: addl %eax, %edi ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edx, %edi +; X86-NEXT: adcl %edx, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: imull %ecx, %eax @@ -277,18 +274,18 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; 
X86-NEXT: addl %ecx, %edi +; X86-NEXT: addl %ecx, %esi ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: movl %edx, %ecx @@ -296,9 +293,9 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: xorl %ecx, %ebp ; X86-NEXT: orl %eax, %ebp -; X86-NEXT: xorl %ecx, %edi +; X86-NEXT: xorl %ecx, %esi ; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: orl %edi, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %ebp, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, 12(%eax) @@ -344,183 +341,184 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 ; X64-NEXT: movq %rcx, %r13 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rsi, %r11 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rsi, %r10 ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq %r13, %rax ; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rcx, %r14 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r9, %rcx ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rbx, %r14 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r14, %rbx ; X64-NEXT: adcq %rsi, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r10d +; X64-NEXT: movzbl %al, %r9d ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r12, %rsi -; X64-NEXT: adcq %r10, %rcx +; X64-NEXT: adcq %r9, %rcx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rbx, %r13 +; X64-NEXT: addq %r9, %r13 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rdi, %r15 +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r12, %rbx -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: adcq %r12, %r9 +; X64-NEXT: setb %dil +; X64-NEXT: movq %r10, %rax +; 
X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rbx, %rbp -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq %r9, %rbp +; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: addq %r15, %rbp -; X64-NEXT: adcq %r14, %r13 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; X64-NEXT: addq %r11, %rbp +; X64-NEXT: adcq %rbx, %r13 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r14 -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq %r11, %rbx -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r10, %r14 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rdi, %r10 -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: addq %r9, %r10 +; X64-NEXT: adcq $0, %r11 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r10, %r14 -; X64-NEXT: adcq %r9, %r11 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r10, %r15 +; X64-NEXT: adcq %r11, %rdi ; X64-NEXT: setb %r10b -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r11, %r9 +; X64-NEXT: addq %rdi, %r9 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: addq %rbp, %r15 +; X64-NEXT: adcq %rax, %r11 +; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r13, %r15 ; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r13, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %rdi +; X64-NEXT: adcq $0, %r11 ; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq %rcx, %rdi +; X64-NEXT: adcq %rcx, %r11 ; X64-NEXT: setb %bl -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Reload -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload +; X64-NEXT: movq %r10, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %r10, %rax -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %cl -; X64-NEXT: 
movq %r14, %rax +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r11, %r15 +; X64-NEXT: addq %r10, %r15 ; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: addq %r9, %rbp -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rdi, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: addq %r9, %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r11, %rdi +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %r14, %rdi -; X64-NEXT: movq %r14, %rbp +; X64-NEXT: movq %rbp, %rdi ; X64-NEXT: sarq $63, %rdi ; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r14 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r9, %r8 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r9, %r10 -; X64-NEXT: setb %cl -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: imulq %r12, %rsi +; X64-NEXT: addq %r10, %r11 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r10, %r9 +; X64-NEXT: setb %sil +; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: imulq %r12, %r8 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq {{[0-9]+}}(%rsp) -; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: addq %r8, %rdx ; X64-NEXT: addq %rax, %rdx ; X64-NEXT: addq %rbx, %rax -; X64-NEXT: adcq %r8, %rdx -; X64-NEXT: addq %r11, %r10 -; X64-NEXT: movzbl %cl, %esi -; X64-NEXT: adcq %r14, %rsi -; X64-NEXT: addq %rax, %r10 +; X64-NEXT: adcq %r11, %rdx +; X64-NEXT: addq %r14, %r9 +; X64-NEXT: movzbl %sil, %esi +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: addq %rax, %r9 ; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: sarq $63, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: adcq $0, %r11 +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rax, %r14 -; X64-NEXT: adcq %rdx, %r11 +; X64-NEXT: adcq %rdx, %rdi ; X64-NEXT: setb %bl ; X64-NEXT: imulq %r12, %rbp ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload @@ -529,28 +527,28 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: addq %rbp, %rdx ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: adcq %r14, %rdx -; X64-NEXT: addq %r9, %r11 -; X64-NEXT: movzbl %bl, %r9d -; X64-NEXT: adcq %rdi, %r9 -; X64-NEXT: addq %rax, %r11 -; X64-NEXT: adcq %rdx, %r9 +; X64-NEXT: addq %r10, %rdi +; X64-NEXT: movzbl %bl, %r10d +; X64-NEXT: adcq %r8, %r10 +; X64-NEXT: addq %rax, %rdi +; X64-NEXT: adcq %rdx, %r10 ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 
8-byte Folded Reload -; X64-NEXT: adcq %r8, %r14 -; X64-NEXT: adcq %r10, %r11 -; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: adcq %r9, %rdi +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload -; X64-NEXT: adcq %r15, %r11 -; X64-NEXT: adcq %r13, %r9 +; X64-NEXT: adcq %r15, %rdi +; X64-NEXT: adcq %r13, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: sarq $63, %rax -; X64-NEXT: xorq %rax, %r9 +; X64-NEXT: xorq %rax, %r10 ; X64-NEXT: xorq %rax, %r14 -; X64-NEXT: orq %r9, %r14 -; X64-NEXT: xorq %rax, %r11 +; X64-NEXT: orq %r10, %r14 +; X64-NEXT: xorq %rax, %rdi ; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: orq %r11, %rax +; X64-NEXT: orq %rdi, %rax ; X64-NEXT: orq %r14, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq %rdx, 24(%rax) @@ -585,548 +583,532 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %esi, %eax ; 
X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi ; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; 
X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %bl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload +; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; 
X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %esi -; 
X86-NEXT: addl %ecx, %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %bl ; X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movl (%esp), %edi ## 4-byte Reload +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), 
%ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %bl -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: movzbl %bl, %eax +; 
X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl (%esp), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %eax ; X86-NEXT: 
movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill @@ -1135,8 +1117,8 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx @@ -1147,11 +1129,10 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %ecx, %eax @@ -1161,37 +1142,36 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %al -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ebp, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl 
%edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %eax, %edi ; X86-NEXT: setb %al -; X86-NEXT: addl %esi, %edi +; X86-NEXT: addl %ebp, %edi ; X86-NEXT: movzbl %al, %edx -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl %edx, %eax @@ -1208,7 +1188,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: addl %ebp, %edi ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: setb %al -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax @@ -1218,250 +1198,248 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edx, %esi +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: imull %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %esi ; X86-NEXT: addl %eax, %edx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, %edx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: 
addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %eax, %edx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl (%esp), %edx ## 4-byte Reload +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %edx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edx, %esi -; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movzbl %bl, %ebx ; 
X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: addl %esi, %eax +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movzbl (%esp), %ebp ## 1-byte Folded Reload +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb %cl ; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %esi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: adcl %edi, (%esp) ## 4-byte Folded Spill +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: setb %al ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %eax, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; 
X86-NEXT: movl %ecx, %ebx +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: setb %al +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %edx, %eax +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl %ebx, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull %ebp, %ebx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, %esi ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: xorl %eax, %edx -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: orl %edx, %edi ; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: xorl %eax, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: xorl %eax, %edx +; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload ; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: 
xorl %eax, %esi -; X86-NEXT: orl %ecx, %esi ; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: xorl %eax, %edi ; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: orl %edi, %eax ; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %esi, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %ebp, 28(%eax) diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll index c07c20d8f414a..c8c1026bdaf3f 100644 --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -175,6 +175,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_power_of_two: ; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pextrw $1, %xmm0, %eax ; SSE-NEXT: leal 31(%rax), %ecx ; SSE-NEXT: testw %ax, %ax @@ -187,16 +188,16 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-NEXT: cmovnsl %ecx, %edx ; SSE-NEXT: andl $-64, %edx ; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movd %ecx, %xmm0 +; SSE-NEXT: pinsrw $1, %eax, %xmm0 +; SSE-NEXT: pextrw $2, %xmm1, %eax ; SSE-NEXT: leal 7(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx ; SSE-NEXT: andl $-8, %ecx ; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm0 +; SSE-NEXT: pextrw $3, %xmm1, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 ; SSE-NEXT: shrl $16, %ecx @@ -208,8 +209,7 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { ; SSE-NEXT: addl %ecx, %edx ; SSE-NEXT: imull $95, %edx, %ecx ; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pinsrw $3, %eax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_srem_power_of_two: @@ -257,32 +257,32 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: pextrw $2, %xmm0, %ecx +; SSE-NEXT: movswl %cx, %eax +; SSE-NEXT: imull $-19945, %eax, %eax # imm = 0xB217 +; SSE-NEXT: shrl $16, %eax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: movzwl %ax, %edx +; SSE-NEXT: movswl %dx, %eax +; SSE-NEXT: shrl $15, %edx +; SSE-NEXT: sarl $4, %eax +; SSE-NEXT: addl %edx, %eax +; SSE-NEXT: leal (%rax,%rax,2), %edx +; SSE-NEXT: shll $3, %edx +; SSE-NEXT: subl %edx, %eax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: pextrw $1, %xmm0, %ecx ; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; SSE-NEXT: movl %ecx, %esi +; SSE-NEXT: imull $12827, %edx, %edx # imm = 0x321B +; SSE-NEXT: movl %edx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %ecx -; SSE-NEXT: addl %esi, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax +; 
SSE-NEXT: sarl $23, %edx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E +; SSE-NEXT: subl %edx, %ecx ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pinsrw $2, %eax, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll index 0226052402cb8..6f0293392eef2 100644 --- a/llvm/test/CodeGen/X86/sse-regcall.ll +++ b/llvm/test/CodeGen/X86/sse-regcall.ll @@ -77,35 +77,35 @@ define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, ; WIN32-NEXT: movaps %xmm7, (%esp) # 16-byte Spill ; WIN32-NEXT: movaps %xmm6, %xmm7 ; WIN32-NEXT: movaps %xmm5, %xmm6 -; WIN32-NEXT: movaps %xmm3, %xmm5 -; WIN32-NEXT: movaps %xmm2, %xmm3 -; WIN32-NEXT: movaps %xmm1, %xmm2 +; WIN32-NEXT: movaps %xmm4, %xmm5 +; WIN32-NEXT: movaps %xmm1, %xmm4 ; WIN32-NEXT: movaps %xmm0, %xmm1 -; WIN32-NEXT: addps %xmm4, %xmm0 -; WIN32-NEXT: mulps %xmm4, %xmm1 +; WIN32-NEXT: addps %xmm5, %xmm0 +; WIN32-NEXT: mulps %xmm5, %xmm1 ; WIN32-NEXT: subps %xmm1, %xmm0 ; WIN32-NEXT: movups 8(%ebp), %xmm1 ; WIN32-NEXT: addps %xmm1, %xmm0 +; WIN32-NEXT: movaps %xmm4, %xmm1 +; WIN32-NEXT: addps %xmm6, %xmm1 +; WIN32-NEXT: mulps %xmm6, %xmm4 +; WIN32-NEXT: subps %xmm4, %xmm1 +; WIN32-NEXT: movups 24(%ebp), %xmm4 +; WIN32-NEXT: addps %xmm4, %xmm1 ; WIN32-NEXT: movaps %xmm2, %xmm4 -; WIN32-NEXT: addps %xmm6, %xmm4 -; WIN32-NEXT: mulps %xmm6, %xmm2 +; WIN32-NEXT: addps %xmm7, %xmm4 +; WIN32-NEXT: mulps %xmm7, %xmm2 ; WIN32-NEXT: subps %xmm2, %xmm4 -; WIN32-NEXT: movups 24(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm4 -; WIN32-NEXT: movaps %xmm3, %xmm2 -; WIN32-NEXT: addps %xmm7, %xmm2 -; WIN32-NEXT: mulps %xmm7, %xmm3 -; WIN32-NEXT: subps %xmm3, %xmm2 -; WIN32-NEXT: movups 40(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm2 +; WIN32-NEXT: movups 40(%ebp), %xmm2 +; WIN32-NEXT: addps %xmm2, %xmm4 +; WIN32-NEXT: movaps %xmm3, %xmm5 +; WIN32-NEXT: movaps (%esp), %xmm2 # 16-byte Reload +; WIN32-NEXT: addps %xmm2, %xmm5 +; WIN32-NEXT: mulps %xmm2, %xmm3 +; WIN32-NEXT: subps %xmm3, %xmm5 +; WIN32-NEXT: movups 56(%ebp), %xmm2 +; WIN32-NEXT: addps %xmm2, %xmm5 +; WIN32-NEXT: movaps %xmm4, %xmm2 ; WIN32-NEXT: movaps %xmm5, %xmm3 -; WIN32-NEXT: movaps (%esp), %xmm1 # 16-byte Reload -; WIN32-NEXT: addps %xmm1, %xmm3 -; WIN32-NEXT: mulps %xmm1, %xmm5 -; WIN32-NEXT: subps %xmm5, %xmm3 -; WIN32-NEXT: movups 56(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm3 -; WIN32-NEXT: movaps %xmm4, %xmm1 ; WIN32-NEXT: movl %ebp, %esp ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl @@ -198,44 +198,43 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: subl $12, %esp ; WIN32-NEXT: movl %esi, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: leal (%edx,%esi), %eax +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: leal (%edx,%edi), %eax ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: subl %esi, %ebx -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: subl %ecx, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: imull %eax, %ecx +; WIN32-NEXT: movl %edx, %eax +; 
WIN32-NEXT: subl %edi, %eax +; WIN32-NEXT: movl %ebp, %edx +; WIN32-NEXT: subl %ecx, %edx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: imull %edx, %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl %esi, %edx +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: imull %eax, %edx +; WIN32-NEXT: addl %ebx, %edx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl (%esp), %edi # 4-byte Reload +; WIN32-NEXT: subl %ebx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %ebx, %eax -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: movl (%esp), %ebx # 4-byte Reload -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: imull %edi, %eax +; WIN32-NEXT: addl %edx, %eax +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; WIN32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %eax, %ecx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: imull %edx, %ebp ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; WIN32-NEXT: addl %esi, %edi -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull %eax, %edx -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: addl %ecx, %edi -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: imull %ebx, %ecx +; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: addl %eax, %ebp +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: addl $12, %esp ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp @@ -243,6 +242,7 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: +; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi @@ -255,35 +255,36 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi ; WIN64-NEXT: leal (%rdx,%rdi), %ebx -; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx -; WIN64-NEXT: subl %edi, %edx -; WIN64-NEXT: leal (%rsi,%r8), %edi +; WIN64-NEXT: movl %edx, %ebp +; WIN64-NEXT: subl %edi, %ebp +; WIN64-NEXT: leal (%rsi,%r8), %edx ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi -; WIN64-NEXT: leal (%r9,%r10), %r8d -; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 -; WIN64-NEXT: subl %r10d, %r9d -; WIN64-NEXT: movl %eax, %r10d -; WIN64-NEXT: subl %ecx, %r10d -; WIN64-NEXT: imull %r10d, %r9d -; WIN64-NEXT: leal (%r11,%r12), %r10d -; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 -; WIN64-NEXT: subl %r12d, %r11d -; WIN64-NEXT: imull %edx, %r11d -; WIN64-NEXT: addl %r9d, %r11d -; WIN64-NEXT: leal (%r14,%r15), %edx -; WIN64-NEXT: movl %r14d, %r9d -; WIN64-NEXT: subl %r15d, %r9d -; WIN64-NEXT: imull %esi, %r9d -; WIN64-NEXT: addl %r11d, %r9d +; WIN64-NEXT: leal 
(%r9,%r10), %edi +; WIN64-NEXT: movl %r9d, %r8d +; WIN64-NEXT: subl %r10d, %r8d +; WIN64-NEXT: movl %eax, %r9d +; WIN64-NEXT: subl %ecx, %r9d +; WIN64-NEXT: imull %r9d, %r8d +; WIN64-NEXT: leal (%r11,%r12), %r9d +; WIN64-NEXT: movl %r11d, %r10d +; WIN64-NEXT: subl %r12d, %r10d +; WIN64-NEXT: imull %ebp, %r10d +; WIN64-NEXT: addl %r8d, %r10d +; WIN64-NEXT: leal (%r14,%r15), %r8d +; WIN64-NEXT: movl %r14d, %r11d +; WIN64-NEXT: subl %r15d, %r11d +; WIN64-NEXT: imull %esi, %r11d +; WIN64-NEXT: addl %r10d, %r11d ; WIN64-NEXT: addl %ecx, %eax -; WIN64-NEXT: imull %r8d, %eax -; WIN64-NEXT: imull %ebx, %r10d -; WIN64-NEXT: addl %r10d, %eax -; WIN64-NEXT: imull %edi, %edx -; WIN64-NEXT: addl %edx, %eax +; WIN64-NEXT: imull %edi, %eax +; WIN64-NEXT: imull %ebx, %r9d ; WIN64-NEXT: addl %r9d, %eax +; WIN64-NEXT: imull %edx, %r8d +; WIN64-NEXT: addl %r8d, %eax +; WIN64-NEXT: addl %r11d, %eax ; WIN64-NEXT: popq %rbx +; WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; ; LINUXOSX-LABEL: testi32_inp: @@ -297,35 +298,35 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX-NEXT: # kill: def $edx killed $edx killed $rdx -; LINUXOSX-NEXT: subl %edi, %edx -; LINUXOSX-NEXT: leal (%rsi,%r8), %edi +; LINUXOSX-NEXT: movl %edx, %r11d +; LINUXOSX-NEXT: subl %edi, %r11d +; LINUXOSX-NEXT: leal (%rsi,%r8), %edx ; LINUXOSX-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX-NEXT: subl %r8d, %esi -; LINUXOSX-NEXT: leal (%r9,%r12), %r8d -; LINUXOSX-NEXT: # kill: def $r9d killed $r9d killed $r9 -; LINUXOSX-NEXT: subl %r12d, %r9d -; LINUXOSX-NEXT: movl %eax, %r11d -; LINUXOSX-NEXT: subl %ecx, %r11d -; LINUXOSX-NEXT: imull %r11d, %r9d -; LINUXOSX-NEXT: leal (%r13,%r14), %r11d +; LINUXOSX-NEXT: leal (%r9,%r12), %edi +; LINUXOSX-NEXT: movl %r9d, %r8d +; LINUXOSX-NEXT: subl %r12d, %r8d +; LINUXOSX-NEXT: movl %eax, %r9d +; LINUXOSX-NEXT: subl %ecx, %r9d +; LINUXOSX-NEXT: imull %r9d, %r8d +; LINUXOSX-NEXT: leal (%r13,%r14), %r9d ; LINUXOSX-NEXT: movl %r13d, %r12d ; LINUXOSX-NEXT: subl %r14d, %r12d -; LINUXOSX-NEXT: imull %edx, %r12d -; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx -; LINUXOSX-NEXT: addl %r9d, %r12d -; LINUXOSX-NEXT: movl %r15d, %r9d -; LINUXOSX-NEXT: subl %edx, %r9d -; LINUXOSX-NEXT: imull %esi, %r9d -; LINUXOSX-NEXT: addl %r12d, %r9d +; LINUXOSX-NEXT: imull %r11d, %r12d +; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; LINUXOSX-NEXT: addl %r8d, %r12d +; LINUXOSX-NEXT: movl %r15d, %r8d +; LINUXOSX-NEXT: subl %r11d, %r8d +; LINUXOSX-NEXT: imull %esi, %r8d +; LINUXOSX-NEXT: addl %r12d, %r8d ; LINUXOSX-NEXT: addl %ecx, %eax -; LINUXOSX-NEXT: imull %r8d, %eax -; LINUXOSX-NEXT: imull %r10d, %r11d -; LINUXOSX-NEXT: addl %r11d, %eax -; LINUXOSX-NEXT: addl %r15d, %edx -; LINUXOSX-NEXT: imull %edi, %edx -; LINUXOSX-NEXT: addl %edx, %eax +; LINUXOSX-NEXT: imull %edi, %eax +; LINUXOSX-NEXT: imull %r10d, %r9d ; LINUXOSX-NEXT: addl %r9d, %eax +; LINUXOSX-NEXT: addl %r15d, %r11d +; LINUXOSX-NEXT: imull %edx, %r11d +; LINUXOSX-NEXT: addl %r11d, %eax +; LINUXOSX-NEXT: addl %r8d, %eax ; LINUXOSX-NEXT: retq i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind { %x1 = sub i32 %a1, %a2 diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll index 80eaf0f900066..c8df7a233d7e3 100644 --- a/llvm/test/CodeGen/X86/sse-regcall4.ll +++ b/llvm/test/CodeGen/X86/sse-regcall4.ll @@ -77,35 +77,35 @@ define 
x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, ; WIN32-NEXT: movaps %xmm7, (%esp) # 16-byte Spill ; WIN32-NEXT: movaps %xmm6, %xmm7 ; WIN32-NEXT: movaps %xmm5, %xmm6 -; WIN32-NEXT: movaps %xmm3, %xmm5 -; WIN32-NEXT: movaps %xmm2, %xmm3 -; WIN32-NEXT: movaps %xmm1, %xmm2 +; WIN32-NEXT: movaps %xmm4, %xmm5 +; WIN32-NEXT: movaps %xmm1, %xmm4 ; WIN32-NEXT: movaps %xmm0, %xmm1 -; WIN32-NEXT: addps %xmm4, %xmm0 -; WIN32-NEXT: mulps %xmm4, %xmm1 +; WIN32-NEXT: addps %xmm5, %xmm0 +; WIN32-NEXT: mulps %xmm5, %xmm1 ; WIN32-NEXT: subps %xmm1, %xmm0 ; WIN32-NEXT: movups 8(%ebp), %xmm1 ; WIN32-NEXT: addps %xmm1, %xmm0 +; WIN32-NEXT: movaps %xmm4, %xmm1 +; WIN32-NEXT: addps %xmm6, %xmm1 +; WIN32-NEXT: mulps %xmm6, %xmm4 +; WIN32-NEXT: subps %xmm4, %xmm1 +; WIN32-NEXT: movups 24(%ebp), %xmm4 +; WIN32-NEXT: addps %xmm4, %xmm1 ; WIN32-NEXT: movaps %xmm2, %xmm4 -; WIN32-NEXT: addps %xmm6, %xmm4 -; WIN32-NEXT: mulps %xmm6, %xmm2 +; WIN32-NEXT: addps %xmm7, %xmm4 +; WIN32-NEXT: mulps %xmm7, %xmm2 ; WIN32-NEXT: subps %xmm2, %xmm4 -; WIN32-NEXT: movups 24(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm4 -; WIN32-NEXT: movaps %xmm3, %xmm2 -; WIN32-NEXT: addps %xmm7, %xmm2 -; WIN32-NEXT: mulps %xmm7, %xmm3 -; WIN32-NEXT: subps %xmm3, %xmm2 -; WIN32-NEXT: movups 40(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm2 +; WIN32-NEXT: movups 40(%ebp), %xmm2 +; WIN32-NEXT: addps %xmm2, %xmm4 +; WIN32-NEXT: movaps %xmm3, %xmm5 +; WIN32-NEXT: movaps (%esp), %xmm2 # 16-byte Reload +; WIN32-NEXT: addps %xmm2, %xmm5 +; WIN32-NEXT: mulps %xmm2, %xmm3 +; WIN32-NEXT: subps %xmm3, %xmm5 +; WIN32-NEXT: movups 56(%ebp), %xmm2 +; WIN32-NEXT: addps %xmm2, %xmm5 +; WIN32-NEXT: movaps %xmm4, %xmm2 ; WIN32-NEXT: movaps %xmm5, %xmm3 -; WIN32-NEXT: movaps (%esp), %xmm1 # 16-byte Reload -; WIN32-NEXT: addps %xmm1, %xmm3 -; WIN32-NEXT: mulps %xmm1, %xmm5 -; WIN32-NEXT: subps %xmm5, %xmm3 -; WIN32-NEXT: movups 56(%ebp), %xmm1 -; WIN32-NEXT: addps %xmm1, %xmm3 -; WIN32-NEXT: movaps %xmm4, %xmm1 ; WIN32-NEXT: movl %ebp, %esp ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl @@ -197,44 +197,43 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: subl $8, %esp -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill ; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: leal (%esi,%eax), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: leal (%eax,%esi), %ecx ; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %esi, %ecx -; WIN32-NEXT: subl %eax, %ecx +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: subl %esi, %ebx ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: subl %edx, %eax -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: imull %eax, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %ecx, %eax -; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: imull %eax, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: addl %ebp, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl %ebp, %ebx ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: subl 
{{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %eax, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: imull %ebx, %eax +; WIN32-NEXT: addl %esi, %eax ; WIN32-NEXT: addl (%esp), %edi # 4-byte Folded Reload ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %eax, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; WIN32-NEXT: addl %esi, %edi +; WIN32-NEXT: imull %esi, %edi ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull %ebp, %edx +; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: imull %ebp, %ecx ; WIN32-NEXT: addl %ecx, %edi +; WIN32-NEXT: addl %eax, %edi ; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: addl $8, %esp ; WIN32-NEXT: popl %ebx @@ -243,6 +242,7 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; WIN64-LABEL: testi32_inp: ; WIN64: # %bb.0: +; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx ; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: # kill: def $esi killed $esi def $rsi @@ -253,36 +253,37 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi ; WIN64-NEXT: leal (%rdx,%rdi), %ebx -; WIN64-NEXT: # kill: def $edx killed $edx killed $rdx -; WIN64-NEXT: subl %edi, %edx -; WIN64-NEXT: leal (%rsi,%r8), %edi +; WIN64-NEXT: movl %edx, %ebp +; WIN64-NEXT: subl %edi, %ebp +; WIN64-NEXT: leal (%rsi,%r8), %edx ; WIN64-NEXT: # kill: def $esi killed $esi killed $rsi ; WIN64-NEXT: subl %r8d, %esi -; WIN64-NEXT: leal (%r9,%r11), %r8d -; WIN64-NEXT: # kill: def $r9d killed $r9d killed $r9 -; WIN64-NEXT: subl %r11d, %r9d -; WIN64-NEXT: movl %eax, %r11d -; WIN64-NEXT: subl %ecx, %r11d -; WIN64-NEXT: imull %r11d, %r9d -; WIN64-NEXT: leal (%r12,%r14), %r11d -; WIN64-NEXT: # kill: def $r12d killed $r12d killed $r12 -; WIN64-NEXT: subl %r14d, %r12d -; WIN64-NEXT: imull %edx, %r12d -; WIN64-NEXT: movl {{[0-9]+}}(%rsp), %edx -; WIN64-NEXT: addl %r9d, %r12d -; WIN64-NEXT: movl %r15d, %r9d -; WIN64-NEXT: subl %edx, %r9d -; WIN64-NEXT: imull %esi, %r9d -; WIN64-NEXT: addl %r12d, %r9d +; WIN64-NEXT: leal (%r9,%r11), %edi +; WIN64-NEXT: movl %r9d, %r8d +; WIN64-NEXT: subl %r11d, %r8d +; WIN64-NEXT: movl %eax, %r9d +; WIN64-NEXT: subl %ecx, %r9d +; WIN64-NEXT: imull %r9d, %r8d +; WIN64-NEXT: leal (%r12,%r14), %r9d +; WIN64-NEXT: movl %r12d, %r11d +; WIN64-NEXT: subl %r14d, %r11d +; WIN64-NEXT: imull %ebp, %r11d +; WIN64-NEXT: movl {{[0-9]+}}(%rsp), %r14d +; WIN64-NEXT: addl %r8d, %r11d +; WIN64-NEXT: movl %r15d, %r8d +; WIN64-NEXT: subl %r14d, %r8d +; WIN64-NEXT: imull %esi, %r8d +; WIN64-NEXT: addl %r11d, %r8d ; WIN64-NEXT: addl %ecx, %eax -; WIN64-NEXT: imull %r8d, %eax -; WIN64-NEXT: imull %ebx, %r11d -; WIN64-NEXT: addl %r11d, %eax -; WIN64-NEXT: addl %r15d, %edx -; WIN64-NEXT: imull %edi, %edx -; WIN64-NEXT: addl %edx, %eax +; WIN64-NEXT: imull %edi, %eax +; WIN64-NEXT: imull %ebx, %r9d ; WIN64-NEXT: addl %r9d, %eax +; WIN64-NEXT: addl %r15d, %r14d +; WIN64-NEXT: imull %edx, %r14d +; WIN64-NEXT: addl %r14d, %eax +; WIN64-NEXT: addl %r8d, %eax ; WIN64-NEXT: popq %rbx +; 
WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; ; LINUXOSX-LABEL: testi32_inp: @@ -296,35 +297,35 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d -; LINUXOSX-NEXT: # kill: def $edx killed $edx killed $rdx -; LINUXOSX-NEXT: subl %edi, %edx -; LINUXOSX-NEXT: leal (%rsi,%r8), %edi +; LINUXOSX-NEXT: movl %edx, %r11d +; LINUXOSX-NEXT: subl %edi, %r11d +; LINUXOSX-NEXT: leal (%rsi,%r8), %edx ; LINUXOSX-NEXT: # kill: def $esi killed $esi killed $rsi ; LINUXOSX-NEXT: subl %r8d, %esi -; LINUXOSX-NEXT: leal (%r9,%r12), %r8d -; LINUXOSX-NEXT: # kill: def $r9d killed $r9d killed $r9 -; LINUXOSX-NEXT: subl %r12d, %r9d -; LINUXOSX-NEXT: movl %eax, %r11d -; LINUXOSX-NEXT: subl %ecx, %r11d -; LINUXOSX-NEXT: imull %r11d, %r9d -; LINUXOSX-NEXT: leal (%r13,%r14), %r11d +; LINUXOSX-NEXT: leal (%r9,%r12), %edi +; LINUXOSX-NEXT: movl %r9d, %r8d +; LINUXOSX-NEXT: subl %r12d, %r8d +; LINUXOSX-NEXT: movl %eax, %r9d +; LINUXOSX-NEXT: subl %ecx, %r9d +; LINUXOSX-NEXT: imull %r9d, %r8d +; LINUXOSX-NEXT: leal (%r13,%r14), %r9d ; LINUXOSX-NEXT: movl %r13d, %r12d ; LINUXOSX-NEXT: subl %r14d, %r12d -; LINUXOSX-NEXT: imull %edx, %r12d -; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx -; LINUXOSX-NEXT: addl %r9d, %r12d -; LINUXOSX-NEXT: movl %r15d, %r9d -; LINUXOSX-NEXT: subl %edx, %r9d -; LINUXOSX-NEXT: imull %esi, %r9d -; LINUXOSX-NEXT: addl %r12d, %r9d +; LINUXOSX-NEXT: imull %r11d, %r12d +; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; LINUXOSX-NEXT: addl %r8d, %r12d +; LINUXOSX-NEXT: movl %r15d, %r8d +; LINUXOSX-NEXT: subl %r11d, %r8d +; LINUXOSX-NEXT: imull %esi, %r8d +; LINUXOSX-NEXT: addl %r12d, %r8d ; LINUXOSX-NEXT: addl %ecx, %eax -; LINUXOSX-NEXT: imull %r8d, %eax -; LINUXOSX-NEXT: imull %r10d, %r11d -; LINUXOSX-NEXT: addl %r11d, %eax -; LINUXOSX-NEXT: addl %r15d, %edx -; LINUXOSX-NEXT: imull %edi, %edx -; LINUXOSX-NEXT: addl %edx, %eax +; LINUXOSX-NEXT: imull %edi, %eax +; LINUXOSX-NEXT: imull %r10d, %r9d ; LINUXOSX-NEXT: addl %r9d, %eax +; LINUXOSX-NEXT: addl %r15d, %r11d +; LINUXOSX-NEXT: imull %edx, %r11d +; LINUXOSX-NEXT: addl %r11d, %eax +; LINUXOSX-NEXT: addl %r8d, %eax ; LINUXOSX-NEXT: retq i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind { %x1 = sub i32 %a1, %a2 diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index c9c62343fb61e..f91758b861b4c 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -71,21 +71,20 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: subl $20, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %edx, %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: xorl %edi, %edi +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %eax, %esi -; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: cmovnel %eax, %edi +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: sarl %cl, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, 
%eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovel %ebx, %eax @@ -100,46 +99,46 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: shldl %cl, %esi, %ebx ; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: cmovnel %edi, %eax +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sarl %cl, %edi ; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sarl $31, %esi +; X86-NEXT: sarl %cl, %esi +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: cmovel %esi, %edx ; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: shrdl %cl, %edi, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: shrdl %cl, %ebx, %edi ; X86-NEXT: testb $32, %ch -; X86-NEXT: cmovnel %edi, %edx +; X86-NEXT: cmovnel %esi, %edi ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: xorl $2147483647, %ecx # imm = 0x7FFFFFFF ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %edi -; X86-NEXT: cmovel (%esp), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: notl %esi +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ebp, %esi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %ebp, %edx ; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF -; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF +; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %ebp ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: cmovel %ebx, %edi +; X86-NEXT: cmovel %ebx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: movl %ebp, 8(%eax) ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload @@ -157,41 +156,41 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-LABEL: vec_v4i32: ; X64: # %bb.0: -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X64-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] +; X64-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,1,1,4,5,6,7] ; X64-NEXT: pslld $23, %xmm1 ; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-NEXT: cvttps2dq %xmm1, 
%xmm5 +; X64-NEXT: cvttps2dq %xmm1, %xmm6 ; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: pmuludq %xmm5, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; X64-NEXT: pmuludq %xmm6, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] ; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-NEXT: pmuludq %xmm7, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; X64-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] -; X64-NEXT: movdqa %xmm6, %xmm7 -; X64-NEXT: psrad %xmm5, %xmm7 -; X64-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] -; X64-NEXT: movdqa %xmm1, %xmm5 -; X64-NEXT: psrad %xmm2, %xmm5 -; X64-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm7[1] -; X64-NEXT: movdqa %xmm6, %xmm2 -; X64-NEXT: psrad %xmm3, %xmm2 -; X64-NEXT: psrad %xmm4, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3] +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-NEXT: pmuludq %xmm7, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; X64-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,3,3,3,4,5,6,7] +; X64-NEXT: movdqa %xmm2, %xmm7 +; X64-NEXT: psrad %xmm6, %xmm7 +; X64-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] +; X64-NEXT: movdqa %xmm1, %xmm6 +; X64-NEXT: psrad %xmm3, %xmm6 +; X64-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; X64-NEXT: movdqa %xmm2, %xmm3 +; X64-NEXT: psrad %xmm4, %xmm3 +; X64-NEXT: psrad %xmm5, %xmm1 +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] ; X64-NEXT: pcmpeqd %xmm0, %xmm1 -; X64-NEXT: pand %xmm1, %xmm6 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-NEXT: pand %xmm1, %xmm2 +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtd %xmm0, %xmm3 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: por %xmm3, %xmm0 ; X64-NEXT: pandn %xmm0, %xmm1 -; X64-NEXT: por %xmm6, %xmm1 +; X64-NEXT: por %xmm2, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; @@ -215,18 +214,18 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: sarl %cl, %edi +; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testl %edx, %edx +; X86-NEXT: testl %edi, %edi ; X86-NEXT: sets %bl ; X86-NEXT: addl $2147483647, %ebx # imm = 0x7FFFFFFF -; X86-NEXT: cmpl %edi, %edx +; X86-NEXT: cmpl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovel %ebp, %ebx +; X86-NEXT: cmovel %edx, %ebx ; X86-NEXT: movl %edi, %ebp ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %ebp diff --git a/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll b/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll index 63b2a9d415041..731cc95114f77 100644 --- a/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll +++ b/llvm/test/CodeGen/X86/statepoint-cmp-sunk-past-statepoint.ll @@ -61,8 +61,7 @@ zero: ; 
CHECK: bb.4 ; CHECK: bb.5 ; CHECK: %3:gr64 = COPY %10 -; CHECK-LV: %4:gr64 = COPY killed %10 -; CHECK-LIS: %4:gr64 = COPY %10 +; CHECK: %4:gr64 = COPY killed %10 ; CHECK: %4:gr64 = nuw ADD64ri32 %4, 8, implicit-def dead $eflags ; CHECK: TEST64rr killed %1, %1, implicit-def $eflags ; CHECK: JCC_1 %bb.1, 5, implicit killed $eflags diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll index d43c0b93136a5..787a33aa49b20 100644 --- a/llvm/test/CodeGen/X86/statepoint-live-in.ll +++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll @@ -449,7 +449,7 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movl %edx, %r14d ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %r8d, %r12d +; CHECK-NEXT: movl %r8d, %r15d ; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d @@ -464,10 +464,10 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _bar ## 160-byte Folded Reload @@ -476,25 +476,25 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload ; CHECK-NEXT: addq %rax, %r14 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload -; CHECK-NEXT: addq %r14, %r12 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Folded Reload +; CHECK-NEXT: addq %r14, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r15 -; CHECK-NEXT: addq %r12, %r15 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: addq %r15, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq %rbx, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp -; CHECK-NEXT: addq %rbx, %rbp +; CHECK-NEXT: addq %r12, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll index 60a3d94ab23b1..5c26e29dce45e 100644 --- a/llvm/test/CodeGen/X86/statepoint-regs.ll +++ b/llvm/test/CodeGen/X86/statepoint-regs.ll @@ -561,7 +561,7 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movl %edx, %r14d ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: 
movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %r8d, %r12d +; CHECK-NEXT: movl %r8d, %r15d ; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d @@ -576,10 +576,10 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r12d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _bar ## 160-byte Folded Reload @@ -588,25 +588,25 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 % ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload ; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload ; CHECK-NEXT: addq %rax, %r14 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload -; CHECK-NEXT: addq %r14, %r12 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Folded Reload +; CHECK-NEXT: addq %r14, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r15 -; CHECK-NEXT: addq %r12, %r15 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: addq %r15, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addq %rax, %r12 +; CHECK-NEXT: addq %rbx, %r12 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbp -; CHECK-NEXT: addq %rbx, %rbp +; CHECK-NEXT: addq %r12, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll index 7d1a6171c844a..870912bb6bb1b 100644 --- a/llvm/test/CodeGen/X86/sttni.ll +++ b/llvm/test/CodeGen/X86/sttni.ll @@ -1110,15 +1110,15 @@ entry: define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %iptr, ptr %fptr) nounwind { ; X86-LABEL: pcmpistr_index_flag: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: pcmpistri $24, %xmm1, %xmm0 -; X86-NEXT: setb %bl -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: popl %ebx +; X86-NEXT: setb %al +; X86-NEXT: movl %ecx, (%esi) +; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_index_flag: @@ -1140,13 +1140,13 @@ entry: define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %fptr) nounwind { ; X86-LABEL: pcmpistr_mask_flag: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: pcmpistrm $24, %xmm1, %xmm0 -; X86-NEXT: setb %dl -; X86-NEXT: movdqa %xmm0, (%ecx) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: setb %al +; X86-NEXT: movdqa %xmm0, (%edx) +; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: pcmpistr_mask_flag: diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 1f9153d662019..e0f438eb7cc8f 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -5163,39 +5163,39 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: notb %dil ; SCALAR-NEXT: movzbl %dil, %edi ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %edi, %r9d -; SCALAR-NEXT: movq %rcx, %r11 -; SCALAR-NEXT: shrq $40, %r11 +; SCALAR-NEXT: movzbl %r9b, %r11d +; SCALAR-NEXT: shll $8, %r11d +; SCALAR-NEXT: orl %edi, %r11d +; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: shrq $40, %r9 ; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r9w, %edi +; SCALAR-NEXT: movzwl %r11w, %edi ; SCALAR-NEXT: orl %r10d, %edi -; SCALAR-NEXT: movq %rcx, %r9 -; SCALAR-NEXT: shrq $56, %r9 +; SCALAR-NEXT: movq %rcx, %r10 +; SCALAR-NEXT: shrq $56, %r10 ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: orq %r8, %rdi ; SCALAR-NEXT: movq %rcx, %r8 ; SCALAR-NEXT: shrq $48, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r8d, %r9d +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: orl %r8d, %r10d ; SCALAR-NEXT: movq %rcx, %r8 ; SCALAR-NEXT: shrq $32, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movzbl %r11b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r8d, %r10d +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %r8d, %r9d ; SCALAR-NEXT: movl %ecx, %r11d ; SCALAR-NEXT: shrl $24, %r11d -; SCALAR-NEXT: shll $16, %r9d -; SCALAR-NEXT: movzwl %r10w, %r8d -; SCALAR-NEXT: orl %r9d, %r8d +; SCALAR-NEXT: shll $16, %r10d +; SCALAR-NEXT: movzwl %r9w, %r8d +; SCALAR-NEXT: orl %r10d, %r8d ; SCALAR-NEXT: movl %ecx, %r9d ; SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b @@ -5209,39 +5209,39 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movzbl %cl, %ecx ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %ecx, %r9d -; SCALAR-NEXT: movq %rax, %r11 -; SCALAR-NEXT: shrq $40, %r11 +; SCALAR-NEXT: movzbl %r9b, %r11d +; SCALAR-NEXT: shll $8, %r11d +; SCALAR-NEXT: orl %ecx, %r11d +; SCALAR-NEXT: movq %rax, %r9 +; SCALAR-NEXT: shrq $40, %r9 ; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r9w, %ecx +; SCALAR-NEXT: movzwl %r11w, %ecx ; SCALAR-NEXT: orl %r10d, %ecx -; SCALAR-NEXT: movq %rax, %r9 -; SCALAR-NEXT: shrq $56, %r9 +; SCALAR-NEXT: movq %rax, %r10 +; SCALAR-NEXT: shrq $56, %r10 ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: orq %r8, %rcx ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $48, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r8d, %r9d +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: orl %r8d, %r10d ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movzbl %r11b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r8d, %r10d +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: orl %r8d, %r9d ; SCALAR-NEXT: movl %eax, %r11d ; SCALAR-NEXT: shrl $24, %r11d -; SCALAR-NEXT: shll $16, %r9d -; SCALAR-NEXT: movzwl %r10w, %r8d -; SCALAR-NEXT: orl %r9d, %r8d +; SCALAR-NEXT: shll $16, %r10d +; SCALAR-NEXT: movzwl %r9w, %r8d +; SCALAR-NEXT: orl %r10d, %r8d ; SCALAR-NEXT: movl %eax, %r9d ; SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b @@ -7455,9 +7455,9 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: notb %bl -; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SCALAR-NEXT: notb %r11b +; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill @@ -7474,36 +7474,36 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movzbl 19(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 20(%rdi), %r11d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 21(%rdi), %eax +; SCALAR-NEXT: movzbl 20(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 22(%rdi), %ebp +; SCALAR-NEXT: movzbl 21(%rdi), %ebp ; SCALAR-NEXT: notb %bpl ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 22(%rdi), %ebx +; SCALAR-NEXT: notb %bl +; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 23(%rdi), %r10d ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 24(%rdi), %r9d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 25(%rdi), %r14d +; SCALAR-NEXT: movzbl 25(%rdi), %ecx +; SCALAR-NEXT: notb %cl +; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 26(%rdi), %r14d ; SCALAR-NEXT: notb %r14b ; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 26(%rdi), %r15d +; SCALAR-NEXT: movzbl 27(%rdi), %r15d ; SCALAR-NEXT: notb %r15b ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 27(%rdi), %r12d +; SCALAR-NEXT: movzbl 28(%rdi), %r12d ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 28(%rdi), %r13d +; SCALAR-NEXT: movzbl 29(%rdi), %r13d ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 29(%rdi), %ecx -; SCALAR-NEXT: notb %cl -; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 30(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill @@ -7512,46 +7512,47 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movb %dil, 31(%rsi) ; SCALAR-NEXT: movb %al, 30(%rsi) -; SCALAR-NEXT: movb %cl, 29(%rsi) -; SCALAR-NEXT: movb %r13b, 28(%rsi) -; SCALAR-NEXT: movb %r12b, 27(%rsi) -; SCALAR-NEXT: movb %r15b, 26(%rsi) -; SCALAR-NEXT: movb %r14b, 25(%rsi) +; SCALAR-NEXT: movb %r13b, 29(%rsi) +; SCALAR-NEXT: movb %r12b, 28(%rsi) +; SCALAR-NEXT: movb %r15b, 27(%rsi) +; SCALAR-NEXT: movb %r14b, 26(%rsi) +; SCALAR-NEXT: movb %cl, 25(%rsi) ; SCALAR-NEXT: movb %r9b, 24(%rsi) ; SCALAR-NEXT: movb %r10b, 23(%rsi) -; SCALAR-NEXT: movb %bpl, 22(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 22(%rsi) ; SCALAR-NEXT: movb %bpl, 21(%rsi) -; SCALAR-NEXT: movb %r11b, 20(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SCALAR-NEXT: movb %bpl, 20(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 19(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 18(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 17(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 16(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 16(%rsi) ; SCALAR-NEXT: movb %r8b, 15(%rsi) ; SCALAR-NEXT: movl %r8d, %r14d -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 14(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 13(%rsi) +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 14(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 13(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 12(%rsi) -; SCALAR-NEXT: movb %bl, 11(%rsi) +; SCALAR-NEXT: movb %r11b, 11(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 10(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 9(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 8(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r11b, 7(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r13b, 10(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 9(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 8(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 7(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 6(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 5(%rsi) +; SCALAR-NEXT: movb %r13b, 6(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 5(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), 
%r12d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r12b, 4(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload @@ -7560,8 +7561,8 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %r15b, 2(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r8b, 1(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, (%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, (%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 31(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload @@ -7582,84 +7583,83 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %sil, 23(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 22(%rdx) -; SCALAR-NEXT: movb %bpl, 21(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 20(%rdx) +; SCALAR-NEXT: movb %sil, 21(%rdx) +; SCALAR-NEXT: movb %bpl, 20(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 19(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 18(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 17(%rdx) -; SCALAR-NEXT: movb %r11b, 16(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 17(%rdx) +; SCALAR-NEXT: movb %cl, 16(%rdx) ; SCALAR-NEXT: movb %r14b, 15(%rdx) -; SCALAR-NEXT: movb %r10b, 14(%rdx) -; SCALAR-NEXT: movb %dil, 13(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 12(%rdx) +; SCALAR-NEXT: movb %bl, 14(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 13(%rdx) +; SCALAR-NEXT: movb %al, 12(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 11(%rdx) -; SCALAR-NEXT: movb %r13b, 10(%rdx) -; SCALAR-NEXT: movb %cl, 9(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 8(%rdx) -; SCALAR-NEXT: movb %al, 7(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 6(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 5(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 10(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r14b, 9(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SCALAR-NEXT: movb %bpl, 8(%rdx) +; SCALAR-NEXT: movb %r11b, 7(%rdx) +; SCALAR-NEXT: movb %r13b, 6(%rdx) +; SCALAR-NEXT: movb %r10b, 5(%rdx) ; SCALAR-NEXT: movb %r12b, 4(%rdx) ; SCALAR-NEXT: movb %r9b, 3(%rdx) ; SCALAR-NEXT: movb %r15b, 2(%rdx) ; SCALAR-NEXT: movb %r8b, 1(%rdx) -; SCALAR-NEXT: movb %bl, (%rdx) -; SCALAR-NEXT: movl %ebx, %edi -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 63(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), 
%ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 62(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 61(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 60(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 59(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 58(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 57(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 56(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 55(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 54(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 53(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 52(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 51(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 50(%rdx) -; SCALAR-NEXT: movb %bpl, 49(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 48(%rdx) -; SCALAR-NEXT: movb %r14b, 47(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 46(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 45(%rdx) -; SCALAR-NEXT: movb %r11b, 44(%rdx) +; SCALAR-NEXT: movb %dil, (%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 63(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 62(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 61(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 60(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 59(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 58(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 57(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 56(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 55(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 54(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 53(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 52(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 51(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 50(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb 
%al, 49(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 48(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 47(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 46(%rdx) +; SCALAR-NEXT: movb %cl, 45(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 44(%rdx) ; SCALAR-NEXT: movb %sil, 43(%rdx) -; SCALAR-NEXT: movb %r13b, 42(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 41(%rdx) -; SCALAR-NEXT: movb %r10b, 40(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 39(%rdx) -; SCALAR-NEXT: movb %cl, 38(%rdx) -; SCALAR-NEXT: movb %al, 37(%rdx) +; SCALAR-NEXT: movb %bl, 42(%rdx) +; SCALAR-NEXT: movb %r14b, 41(%rdx) +; SCALAR-NEXT: movb %bpl, 40(%rdx) +; SCALAR-NEXT: movb %r11b, 39(%rdx) +; SCALAR-NEXT: movb %r13b, 38(%rdx) +; SCALAR-NEXT: movb %r10b, 37(%rdx) ; SCALAR-NEXT: movb %r12b, 36(%rdx) ; SCALAR-NEXT: movb %r9b, 35(%rdx) ; SCALAR-NEXT: movb %r15b, 34(%rdx) diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll index 41d2b1e1939cc..6d77e04504e2d 100644 --- a/llvm/test/CodeGen/X86/swifterror.ll +++ b/llvm/test/CodeGen/X86/swifterror.ll @@ -1708,11 +1708,11 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: .cfi_offset %edi, -16 ; CHECK-i386-NEXT: .cfi_offset %ebx, -12 ; CHECK-i386-NEXT: .cfi_offset %ebp, -8 +; CHECK-i386-NEXT: movl 148(%esp), %esi ; CHECK-i386-NEXT: movl $0, 64(%esp) -; CHECK-i386-NEXT: movl 188(%esp), %ebp ; CHECK-i386-NEXT: movl 192(%esp), %ebx -; CHECK-i386-NEXT: movl 196(%esp), %edi -; CHECK-i386-NEXT: movl 200(%esp), %esi +; CHECK-i386-NEXT: movl 196(%esp), %ebp +; CHECK-i386-NEXT: movl 200(%esp), %edi ; CHECK-i386-NEXT: leal 64(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 52(%esp) ; CHECK-i386-NEXT: movl $0, 48(%esp) @@ -1729,10 +1729,11 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl $0, 4(%esp) ; CHECK-i386-NEXT: movl $1, (%esp) ; CHECK-i386-NEXT: calll _params_in_reg2 -; CHECK-i386-NEXT: movl %esi, 56(%esp) -; CHECK-i386-NEXT: movl %edi, 52(%esp) +; CHECK-i386-NEXT: movl %edi, 56(%esp) +; CHECK-i386-NEXT: movl %ebp, 52(%esp) ; CHECK-i386-NEXT: movl %ebx, 48(%esp) -; CHECK-i386-NEXT: movl %ebp, 44(%esp) +; CHECK-i386-NEXT: movl 188(%esp), %eax +; CHECK-i386-NEXT: movl %eax, 44(%esp) ; CHECK-i386-NEXT: movl 184(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 40(%esp) ; CHECK-i386-NEXT: movl 180(%esp), %eax @@ -1751,8 +1752,7 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl %eax, 12(%esp) ; CHECK-i386-NEXT: movl 152(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 8(%esp) -; CHECK-i386-NEXT: movl 148(%esp), %eax -; CHECK-i386-NEXT: movl %eax, 4(%esp) +; CHECK-i386-NEXT: movl %esi, 4(%esp) ; CHECK-i386-NEXT: leal 88(%esp), %eax ; CHECK-i386-NEXT: movl %eax, (%esp) ; CHECK-i386-NEXT: calll _params_and_return_in_reg2 @@ -1767,8 +1767,8 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-i386-NEXT: movl 104(%esp), %ebp ; CHECK-i386-NEXT: movl 108(%esp), %edi -; CHECK-i386-NEXT: movl 112(%esp), %ebx -; CHECK-i386-NEXT: movl 
116(%esp), %esi +; CHECK-i386-NEXT: movl 112(%esp), %esi +; CHECK-i386-NEXT: movl 116(%esp), %ebx ; CHECK-i386-NEXT: leal 64(%esp), %eax ; CHECK-i386-NEXT: movl %eax, 52(%esp) ; CHECK-i386-NEXT: movl $0, 48(%esp) @@ -1786,8 +1786,8 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 ; CHECK-i386-NEXT: movl $1, (%esp) ; CHECK-i386-NEXT: calll _params_in_reg2 ; CHECK-i386-NEXT: movl 144(%esp), %eax -; CHECK-i386-NEXT: movl %esi, 28(%eax) -; CHECK-i386-NEXT: movl %ebx, 24(%eax) +; CHECK-i386-NEXT: movl %ebx, 28(%eax) +; CHECK-i386-NEXT: movl %esi, 24(%eax) ; CHECK-i386-NEXT: movl %edi, 20(%eax) ; CHECK-i386-NEXT: movl %ebp, 16(%eax) ; CHECK-i386-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll index e6b39eb8b9522..f0479aea1b82c 100644 --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -286,13 +286,13 @@ define i128 @test_i128_1(i128 %a) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: cmpl $1, %eax ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %edx, %edx ; X86-NEXT: movl $1, %edi ; X86-NEXT: cmovnel %eax, %edi ; X86-NEXT: cmovel %ebx, %edi @@ -300,17 +300,17 @@ define i128 @test_i128_1(i128 %a) nounwind { ; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: negl %ebp ; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: sbbl %esi, %ebp ; X86-NEXT: movl $1, %ebp ; X86-NEXT: cmovbl %eax, %ebp ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: cmovbl %esi, %ebx -; X86-NEXT: orl %edx, %eax +; X86-NEXT: cmovbl %edx, %ebx +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: cmovel %edi, %ebp -; X86-NEXT: cmovel %esi, %ebx +; X86-NEXT: cmovel %edx, %ebx ; X86-NEXT: movl %ebx, 4(%eax) ; X86-NEXT: movl %ebp, (%eax) ; X86-NEXT: popl %esi @@ -367,29 +367,29 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %edi, %edx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl %ebx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmovbl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovbl %ebx, %esi -; X86-NEXT: cmpl %ecx, %ebp -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: cmovbl %ecx, %ebp -; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: cmovbl %ebx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmovbl %ebp, %esi +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: cmovbl %edx, %edi +; X86-NEXT: cmovbl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; 
X86-NEXT: movl %ebp, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -715,26 +715,26 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: cmoval %eax, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmoval %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmoval %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 24(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 4(%edx) -; X86-NEXT: movl %ebp, (%edx) -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %eax, 16(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1316,17 +1316,17 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shrdl $28, %edi, %ecx ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmpl %esi, %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: cmovbl %edx, %ecx -; X86-NEXT: cmovbl %esi, %edi +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: cmovbl %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %ecx, (%eax) diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll index e1538aaaeba65..e4ce08966a894 100644 --- a/llvm/test/CodeGen/X86/umin.ll +++ b/llvm/test/CodeGen/X86/umin.ll @@ -397,26 +397,26 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: cmovbl %eax, %ecx ; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovbl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl %eax, %edx -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 24(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl %edi, 8(%edx) -; X86-NEXT: movl %ebx, 4(%edx) -; X86-NEXT: movl %ebp, (%edx) -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %eax, 16(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: movl %ebp, (%ecx) +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -731,17 +731,17 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shrdl $28, %edi, %ecx ; X86-NEXT: sarl $28, %edi -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: cmovbl %edx, %ecx -; X86-NEXT: cmovbl %esi, %edi +; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: cmovbl %edx, %edi ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %ecx, (%eax) diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll index f5248d8679717..ccabb360a990c 100644 --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -93,426 +93,417 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ebp ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl 
%esi, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %edi, %esi ; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), 
%ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: 
movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl %bl, %ecx -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull %edx, %ecx -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: movl %eax, %edx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: addl %edi, %edx +; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, 
%ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: setb %cl +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl %ebx, (%esp) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: 
movl %eax, %ecx -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: imull %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %ecx, %esi ; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ecx +; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %ecx, %esi +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: 
addl %esi, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 4(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 16(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 20(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 24(%ecx) +; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 28(%ecx) +; X86-NEXT: movl %eax, 32(%ecx) +; X86-NEXT: andl $4095, %ebx # imm = 0xFFF +; X86-NEXT: movw %bx, 36(%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ebp -; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 16(%edx) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 28(%edx) -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: andl $4095, %ecx # imm = 0xFFF -; X86-NEXT: movw %cx, 36(%edx) -; X86-NEXT: movl %edx, %eax ; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -532,57 +523,57 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: movq %r8, %r11 ; X64-NEXT: movq %rcx, %r8 ; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %r10, %rbp +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %rbx, %r15 ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r15, %rbx -; X64-NEXT: adcq %r14, %r12 +; X64-NEXT: adcq %r14, %rbp ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %r10d ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: 
movq %rdx, %r15 +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r12, %r13 -; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: addq %rbp, %r13 +; X64-NEXT: adcq %r10, %r12 ; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r12, %r10 +; X64-NEXT: addq %r15, %r10 ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r10, %r12 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r10, %r15 ; X64-NEXT: adcq %rbp, %rdx ; X64-NEXT: imulq %r9, %r11 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: addq %r13, %r14 -; X64-NEXT: adcq %r15, %r12 +; X64-NEXT: adcq %r12, %r15 ; X64-NEXT: adcq %rdx, %r11 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r13 @@ -596,8 +587,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: adcq %r13, %rdx ; X64-NEXT: imulq %r10, %rcx ; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: addq %r14, %r15 -; X64-NEXT: adcq %r12, %rax +; X64-NEXT: addq %r14, %r12 +; X64-NEXT: adcq %r15, %rax ; X64-NEXT: adcq %r11, %rcx ; X64-NEXT: imulq %r9, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload @@ -609,7 +600,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X64-NEXT: movq %rbx, 8(%rdi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, (%rdi) -; X64-NEXT: movq %r15, 16(%rdi) +; X64-NEXT: movq %r12, 16(%rdi) ; X64-NEXT: movq %rax, 24(%rdi) ; X64-NEXT: movl %esi, 32(%rdi) ; X64-NEXT: shrq $32, %rsi diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index cb4bdd1ede75c..eacc714b49a4d 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -266,20 +266,20 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %edx, %edi ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx @@ -359,22 +359,22 @@ define i64 @func9(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %ebp ; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: addl %ebp, %ecx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %edi, %edx diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll index 33f43c75cad3d..6b6845147e043 100644 --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -52,31 +52,31 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: shrdl $2, %eax, %ecx ; X86-NEXT: shrdl $2, %edx, %eax ; X86-NEXT: shrl $2, %edx -; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %edi, %edx ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: cmovel %eax, %edx @@ -441,30 +441,30 @@ define i64 @func7(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %edx, %esi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: negl %esi +; X86-NEXT: negl %edi ; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: orl %ecx, %edx diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 9800c116ea15f..82603b35ba712 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -44,87 +44,89 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), 
%esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %ebp +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: seto (%esp) # 1-byte Folded Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: leal (%ecx,%eax), %ecx -; X86-NEXT: seto %bh +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: mull %edi +; X86-NEXT: leal (%ecx,%eax), %esi +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: leal (%esi,%eax), %esi +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: leal (%ecx,%eax), %ecx -; X86-NEXT: seto %bl -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, 
%eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %ch ; X86-NEXT: andb %cl, %ch -; X86-NEXT: orb (%esp), %bh # 1-byte Folded Reload -; X86-NEXT: orb %ch, %bh -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; X86-NEXT: movb %bh, (%esp) # 1-byte Spill -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: orb %ch, %cl +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %ch ; X86-NEXT: andb %cl, %ch +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: orb %ch, %bl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload @@ -141,7 +143,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: setne %al ; X86-NEXT: andb %bh, %al ; X86-NEXT: orb %bl, %al -; X86-NEXT: orb (%esp), %al # 1-byte Folded Reload +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload ; X86-NEXT: andb $1, %al ; X86-NEXT: movb %al, 16(%ecx) diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll index 6f639880dc574..b1194bedc4e1c 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -1210,21 +1210,21 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebx -; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r14d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r15d -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r14d +; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r12d +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi ; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx ; CHECK-BASELINE-NEXT: movzbl 2(%r8), %edi ; CHECK-BASELINE-NEXT: movzbl (%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 1(%r8), %ecx -; CHECK-BASELINE-NEXT: movzbl 
(%rsi), %esi -; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: andb (%r10), %sil -; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl (%r9), %ebx +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb (%r10), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 1(%r9), %eax ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: andb 1(%r10), %al @@ -1241,34 +1241,34 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: xorb %dl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 4(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: andb 4(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 5(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: xorb %r13b, %al ; CHECK-BASELINE-NEXT: andb 5(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r12b, %al +; CHECK-BASELINE-NEXT: xorb %r13b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 6(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: xorb %r12b, %al ; CHECK-BASELINE-NEXT: andb 6(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r15b, %al +; CHECK-BASELINE-NEXT: xorb %r12b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 7(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al ; CHECK-BASELINE-NEXT: andb 7(%r10), %al -; CHECK-BASELINE-NEXT: xorb %r14b, %al +; CHECK-BASELINE-NEXT: xorb %r15b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 8(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %r14b, %al ; CHECK-BASELINE-NEXT: andb 8(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %r14b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 9(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: andb 9(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bl, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%r9), %eax ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -1357,10 +1357,10 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: andb 24(%r10), %r14b ; CHECK-BASELINE-NEXT: xorb %al, %r14b ; CHECK-BASELINE-NEXT: movzbl 25(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 25(%r9), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb 25(%r10), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: movzbl 25(%r9), %ebp +; CHECK-BASELINE-NEXT: xorb %al, %bpl +; CHECK-BASELINE-NEXT: andb 25(%r10), %bpl +; CHECK-BASELINE-NEXT: xorb %al, %bpl ; CHECK-BASELINE-NEXT: movzbl 26(%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 26(%r9), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil @@ -1381,11 +1381,11 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; 
CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb 29(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl 30(%r8), %ebp +; CHECK-BASELINE-NEXT: movzbl 30(%r8), %ebx ; CHECK-BASELINE-NEXT: movzbl 30(%r9), %eax -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: andb 30(%r10), %al -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %bl, %al ; CHECK-BASELINE-NEXT: movzbl 31(%r8), %r8d ; CHECK-BASELINE-NEXT: movzbl 31(%r9), %r9d ; CHECK-BASELINE-NEXT: xorb %r8b, %r9b @@ -1397,7 +1397,7 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movb %dl, 28(%r11) ; CHECK-BASELINE-NEXT: movb %sil, 27(%r11) ; CHECK-BASELINE-NEXT: movb %dil, 26(%r11) -; CHECK-BASELINE-NEXT: movb %bl, 25(%r11) +; CHECK-BASELINE-NEXT: movb %bpl, 25(%r11) ; CHECK-BASELINE-NEXT: movb %r14b, 24(%r11) ; CHECK-BASELINE-NEXT: movb %r15b, 23(%r11) ; CHECK-BASELINE-NEXT: movb %r12b, 22(%r11) @@ -1477,21 +1477,21 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebx -; CHECK-SSE1-NEXT: movzbl 8(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r14d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r15d -; CHECK-SSE1-NEXT: movzbl 5(%rdx), %r12d -; CHECK-SSE1-NEXT: movzbl 4(%rdx), %r13d +; CHECK-SSE1-NEXT: movzbl 9(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r14d +; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r15d +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r12d +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %r13d +; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi ; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx ; CHECK-SSE1-NEXT: movzbl 2(%r8), %edi ; CHECK-SSE1-NEXT: movzbl (%r8), %eax ; CHECK-SSE1-NEXT: movzbl 1(%r8), %ecx -; CHECK-SSE1-NEXT: movzbl (%rsi), %esi -; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: andb (%r10), %sil -; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl (%r9), %ebx +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: andb (%r10), %bl +; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 1(%r9), %eax ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: andb 1(%r10), %al @@ -1508,34 +1508,34 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: xorb %dl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 4(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r13b, %al +; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: andb 4(%r10), %al -; CHECK-SSE1-NEXT: xorb %r13b, %al +; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 5(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: xorb %r13b, %al ; CHECK-SSE1-NEXT: andb 5(%r10), %al -; CHECK-SSE1-NEXT: xorb %r12b, %al +; CHECK-SSE1-NEXT: xorb %r13b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 6(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: xorb %r12b, %al ; CHECK-SSE1-NEXT: andb 6(%r10), %al -; CHECK-SSE1-NEXT: xorb %r15b, %al +; CHECK-SSE1-NEXT: xorb %r12b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: 
movzbl 7(%r9), %eax -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: andb 7(%r10), %al -; CHECK-SSE1-NEXT: xorb %r14b, %al +; CHECK-SSE1-NEXT: xorb %r15b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 8(%r9), %eax -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %r14b, %al ; CHECK-SSE1-NEXT: andb 8(%r10), %al -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %r14b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 9(%r9), %eax -; CHECK-SSE1-NEXT: xorb %bl, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: andb 9(%r10), %al -; CHECK-SSE1-NEXT: xorb %bl, %al +; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%r9), %eax ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -1624,10 +1624,10 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: andb 24(%r10), %r14b ; CHECK-SSE1-NEXT: xorb %al, %r14b ; CHECK-SSE1-NEXT: movzbl 25(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 25(%r9), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb 25(%r10), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl +; CHECK-SSE1-NEXT: movzbl 25(%r9), %ebp +; CHECK-SSE1-NEXT: xorb %al, %bpl +; CHECK-SSE1-NEXT: andb 25(%r10), %bpl +; CHECK-SSE1-NEXT: xorb %al, %bpl ; CHECK-SSE1-NEXT: movzbl 26(%r8), %eax ; CHECK-SSE1-NEXT: movzbl 26(%r9), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil @@ -1648,11 +1648,11 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb 29(%r10), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl 30(%r8), %ebp +; CHECK-SSE1-NEXT: movzbl 30(%r8), %ebx ; CHECK-SSE1-NEXT: movzbl 30(%r9), %eax -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %bl, %al ; CHECK-SSE1-NEXT: andb 30(%r10), %al -; CHECK-SSE1-NEXT: xorb %bpl, %al +; CHECK-SSE1-NEXT: xorb %bl, %al ; CHECK-SSE1-NEXT: movzbl 31(%r8), %r8d ; CHECK-SSE1-NEXT: movzbl 31(%r9), %r9d ; CHECK-SSE1-NEXT: xorb %r8b, %r9b @@ -1664,7 +1664,7 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movb %dl, 28(%r11) ; CHECK-SSE1-NEXT: movb %sil, 27(%r11) ; CHECK-SSE1-NEXT: movb %dil, 26(%r11) -; CHECK-SSE1-NEXT: movb %bl, 25(%r11) +; CHECK-SSE1-NEXT: movb %bpl, 25(%r11) ; CHECK-SSE1-NEXT: movb %r14b, 24(%r11) ; CHECK-SSE1-NEXT: movb %r15b, 23(%r11) ; CHECK-SSE1-NEXT: movb %r12b, 22(%r11) @@ -3231,10 +3231,12 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rdx, %r13 -; CHECK-BASELINE-NEXT: movq %rsi, %r12 -; CHECK-BASELINE-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %r15d +; CHECK-BASELINE-NEXT: movq %rcx, %r12 +; CHECK-BASELINE-NEXT: movq %rdx, %r15 +; CHECK-BASELINE-NEXT: movq %rsi, %r14 +; CHECK-BASELINE-NEXT: movq %rdi, %r13 +; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 13(%rdx), %eax @@ -3248,197 +3250,199 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movzbl 9(%rdx), 
%r8d ; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d ; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r11d -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %edi -; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %esi -; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %edx -; CHECK-BASELINE-NEXT: movzbl (%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 1(%r13), %ebx -; CHECK-BASELINE-NEXT: movzbl (%r12), %r14d -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: andb (%rcx), %r14b -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 1(%r12), %eax -; CHECK-BASELINE-NEXT: xorb %bl, %al -; CHECK-BASELINE-NEXT: andb 1(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %bl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 2(%r12), %eax -; CHECK-BASELINE-NEXT: xorb %dl, %al -; CHECK-BASELINE-NEXT: andb 2(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %dl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 3(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi +; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %ecx +; CHECK-BASELINE-NEXT: movzbl (%rdx), %r11d +; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %edx +; CHECK-BASELINE-NEXT: movzbl (%r14), %ebx +; CHECK-BASELINE-NEXT: xorb %r11b, %bl +; CHECK-BASELINE-NEXT: andb (%r12), %bl +; CHECK-BASELINE-NEXT: xorb %r11b, %bl +; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 1(%r14), %r11d +; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: andb 1(%r12), %r11b +; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 2(%r14), %edx +; CHECK-BASELINE-NEXT: xorb %cl, %dl +; CHECK-BASELINE-NEXT: andb 2(%r12), %dl +; CHECK-BASELINE-NEXT: xorb %cl, %dl +; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 3(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 3(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 4(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %sil, %al -; CHECK-BASELINE-NEXT: andb 3(%rcx), %al +; CHECK-BASELINE-NEXT: andb 4(%r12), %al ; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 4(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 5(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %dil, %al -; CHECK-BASELINE-NEXT: andb 4(%rcx), %al +; CHECK-BASELINE-NEXT: andb 5(%r12), %al ; CHECK-BASELINE-NEXT: xorb %dil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 5(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %bpl, %al -; CHECK-BASELINE-NEXT: andb 5(%rcx), %al +; CHECK-BASELINE-NEXT: andb 6(%r12), %al ; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 6(%r12), %eax -; CHECK-BASELINE-NEXT: xorb %r11b, %al -; CHECK-BASELINE-NEXT: andb 6(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %r11b, %al -; 
CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 7(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 7(%rcx), %al +; CHECK-BASELINE-NEXT: andb 7(%r12), %al ; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 8(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r9b, %al -; CHECK-BASELINE-NEXT: andb 8(%rcx), %al +; CHECK-BASELINE-NEXT: andb 8(%r12), %al ; CHECK-BASELINE-NEXT: xorb %r9b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%r12), %eax +; CHECK-BASELINE-NEXT: movzbl 9(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r8b, %al -; CHECK-BASELINE-NEXT: andb 9(%rcx), %al +; CHECK-BASELINE-NEXT: andb 9(%r12), %al ; CHECK-BASELINE-NEXT: xorb %r8b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 10(%r12), %edx +; CHECK-BASELINE-NEXT: movzbl 10(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 10(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 11(%r12), %edx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 10(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 11(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 11(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 12(%r12), %edx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 11(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 12(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 12(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 13(%r12), %edx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 12(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 13(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 13(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 14(%r12), %edx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 13(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 14(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 14(%rcx), %dl -; 
CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 15(%r12), %eax -; CHECK-BASELINE-NEXT: xorb %r15b, %al -; CHECK-BASELINE-NEXT: andb 15(%rcx), %al -; CHECK-BASELINE-NEXT: xorb %r15b, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 16(%r12), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 16(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 17(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 17(%r12), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 17(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 18(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 18(%r12), %edx -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 18(%rcx), %dl -; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 19(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 19(%r12), %r15d -; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: andb 19(%rcx), %r15b -; CHECK-BASELINE-NEXT: movq %rcx, %rdx -; CHECK-BASELINE-NEXT: xorb %al, %r15b -; CHECK-BASELINE-NEXT: movzbl 20(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 20(%r12), %r14d -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: andb 20(%rcx), %r14b -; CHECK-BASELINE-NEXT: xorb %al, %r14b -; CHECK-BASELINE-NEXT: movzbl 21(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 21(%r12), %ebp +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 14(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 15(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 15(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 16(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 16(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 16(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 17(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 17(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 17(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 18(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 18(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 18(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 19(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 19(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 19(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 20(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 20(%r14), %ecx +; CHECK-BASELINE-NEXT: xorb 
%al, %cl +; CHECK-BASELINE-NEXT: andb 20(%r12), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 21(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 21(%r14), %ebp ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: andb 21(%rcx), %bpl +; CHECK-BASELINE-NEXT: andb 21(%r12), %bpl ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: movzbl 22(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 22(%r12), %ebx +; CHECK-BASELINE-NEXT: movzbl 22(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebx ; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb 22(%rcx), %bl +; CHECK-BASELINE-NEXT: andb 22(%r12), %bl ; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: movzbl 23(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 23(%r12), %r11d +; CHECK-BASELINE-NEXT: movzbl 23(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 23(%r14), %r11d ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: andb 23(%rcx), %r11b +; CHECK-BASELINE-NEXT: andb 23(%r12), %r11b ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: movzbl 24(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 24(%r12), %r10d -; CHECK-BASELINE-NEXT: xorb %al, %r10b -; CHECK-BASELINE-NEXT: andb 24(%rcx), %r10b -; CHECK-BASELINE-NEXT: xorb %al, %r10b -; CHECK-BASELINE-NEXT: movzbl 25(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 25(%r12), %r9d +; CHECK-BASELINE-NEXT: movzbl 24(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 24(%r14), %r9d ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: andb 25(%rcx), %r9b +; CHECK-BASELINE-NEXT: andb 24(%r12), %r9b ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: movzbl 26(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 26(%r12), %r8d +; CHECK-BASELINE-NEXT: movzbl 25(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 25(%r14), %r8d ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: andb 26(%rcx), %r8b +; CHECK-BASELINE-NEXT: andb 25(%r12), %r8b ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: movzbl 27(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 27(%r12), %edi +; CHECK-BASELINE-NEXT: movzbl 26(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 26(%r14), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: andb 27(%rcx), %dil +; CHECK-BASELINE-NEXT: andb 26(%r12), %dil ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: movzbl 28(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 28(%r12), %esi +; CHECK-BASELINE-NEXT: movzbl 27(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 27(%r14), %esi ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: andb 28(%rcx), %sil +; CHECK-BASELINE-NEXT: andb 27(%r12), %sil ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: movzbl 29(%r13), %eax -; CHECK-BASELINE-NEXT: movzbl 29(%r12), %ecx +; CHECK-BASELINE-NEXT: movzbl 28(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 28(%r14), %edx +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: andb 28(%r12), %dl +; CHECK-BASELINE-NEXT: xorb %al, %dl +; CHECK-BASELINE-NEXT: movzbl 29(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 29(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 29(%rdx), %cl +; CHECK-BASELINE-NEXT: andb 29(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl 30(%r13), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 30(%r12), %eax -; CHECK-BASELINE-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: 
andb 30(%rdx), %al -; CHECK-BASELINE-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movzbl 31(%r13), %r13d -; CHECK-BASELINE-NEXT: movzbl 31(%r12), %r12d -; CHECK-BASELINE-NEXT: xorb %r13b, %r12b -; CHECK-BASELINE-NEXT: andb 31(%rdx), %r12b -; CHECK-BASELINE-NEXT: xorb %r13b, %r12b -; CHECK-BASELINE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; CHECK-BASELINE-NEXT: movb %r12b, 31(%r13) +; CHECK-BASELINE-NEXT: movzbl 30(%r15), %r10d +; CHECK-BASELINE-NEXT: movzbl 30(%r14), %eax +; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: andb 30(%r12), %al +; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: movzbl 31(%r15), %r10d +; CHECK-BASELINE-NEXT: movzbl 31(%r14), %r14d +; CHECK-BASELINE-NEXT: xorb %r10b, %r14b +; CHECK-BASELINE-NEXT: andb 31(%r12), %r14b +; CHECK-BASELINE-NEXT: xorb %r10b, %r14b +; CHECK-BASELINE-NEXT: movb %r14b, 31(%r13) ; CHECK-BASELINE-NEXT: movb %al, 30(%r13) ; CHECK-BASELINE-NEXT: movb %cl, 29(%r13) -; CHECK-BASELINE-NEXT: movb %sil, 28(%r13) -; CHECK-BASELINE-NEXT: movb %dil, 27(%r13) -; CHECK-BASELINE-NEXT: movb %r8b, 26(%r13) -; CHECK-BASELINE-NEXT: movb %r9b, 25(%r13) -; CHECK-BASELINE-NEXT: movb %r10b, 24(%r13) +; CHECK-BASELINE-NEXT: movb %dl, 28(%r13) +; CHECK-BASELINE-NEXT: movb %sil, 27(%r13) +; CHECK-BASELINE-NEXT: movb %dil, 26(%r13) +; CHECK-BASELINE-NEXT: movb %r8b, 25(%r13) +; CHECK-BASELINE-NEXT: movb %r9b, 24(%r13) ; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13) ; CHECK-BASELINE-NEXT: movb %bl, 22(%r13) ; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13) -; CHECK-BASELINE-NEXT: movb %r14b, 20(%r13) -; CHECK-BASELINE-NEXT: movb %r15b, 19(%r13) +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: movb %al, 20(%r13) +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: movb %al, 19(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: movb %al, 18(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload @@ -3494,10 +3498,12 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rdx, %r13 -; CHECK-SSE1-NEXT: movq %rsi, %r12 -; CHECK-SSE1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-SSE1-NEXT: movzbl 15(%rdx), %r15d +; CHECK-SSE1-NEXT: movq %rcx, %r12 +; CHECK-SSE1-NEXT: movq %rdx, %r15 +; CHECK-SSE1-NEXT: movq %rsi, %r14 +; CHECK-SSE1-NEXT: movq %rdi, %r13 +; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 13(%rdx), %eax @@ -3511,197 +3517,199 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d ; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d ; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r11d -; CHECK-SSE1-NEXT: movzbl 5(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 4(%rdx), %edi -; CHECK-SSE1-NEXT: movzbl 3(%rdx), %esi -; CHECK-SSE1-NEXT: movzbl 2(%rdx), %edx -; CHECK-SSE1-NEXT: movzbl (%r13), %eax -; CHECK-SSE1-NEXT: movzbl 1(%r13), %ebx -; CHECK-SSE1-NEXT: movzbl (%r12), %r14d -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: andb (%rcx), %r14b -; CHECK-SSE1-NEXT: xorb %al, 
%r14b -; CHECK-SSE1-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 1(%r12), %eax -; CHECK-SSE1-NEXT: xorb %bl, %al -; CHECK-SSE1-NEXT: andb 1(%rcx), %al -; CHECK-SSE1-NEXT: xorb %bl, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 2(%r12), %eax -; CHECK-SSE1-NEXT: xorb %dl, %al -; CHECK-SSE1-NEXT: andb 2(%rcx), %al -; CHECK-SSE1-NEXT: xorb %dl, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 3(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi +; CHECK-SSE1-NEXT: movzbl 3(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 2(%rdx), %ecx +; CHECK-SSE1-NEXT: movzbl (%rdx), %r11d +; CHECK-SSE1-NEXT: movzbl 1(%rdx), %edx +; CHECK-SSE1-NEXT: movzbl (%r14), %ebx +; CHECK-SSE1-NEXT: xorb %r11b, %bl +; CHECK-SSE1-NEXT: andb (%r12), %bl +; CHECK-SSE1-NEXT: xorb %r11b, %bl +; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 1(%r14), %r11d +; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: andb 1(%r12), %r11b +; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 2(%r14), %edx +; CHECK-SSE1-NEXT: xorb %cl, %dl +; CHECK-SSE1-NEXT: andb 2(%r12), %dl +; CHECK-SSE1-NEXT: xorb %cl, %dl +; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 3(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 3(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 4(%r14), %eax ; CHECK-SSE1-NEXT: xorb %sil, %al -; CHECK-SSE1-NEXT: andb 3(%rcx), %al +; CHECK-SSE1-NEXT: andb 4(%r12), %al ; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 4(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 5(%r14), %eax ; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: andb 4(%rcx), %al +; CHECK-SSE1-NEXT: andb 5(%r12), %al ; CHECK-SSE1-NEXT: xorb %dil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 5(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax ; CHECK-SSE1-NEXT: xorb %bpl, %al -; CHECK-SSE1-NEXT: andb 5(%rcx), %al +; CHECK-SSE1-NEXT: andb 6(%r12), %al ; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 6(%r12), %eax -; CHECK-SSE1-NEXT: xorb %r11b, %al -; CHECK-SSE1-NEXT: andb 6(%rcx), %al -; CHECK-SSE1-NEXT: xorb %r11b, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 7(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: andb 7(%rcx), %al +; CHECK-SSE1-NEXT: andb 7(%r12), %al ; CHECK-SSE1-NEXT: xorb %r10b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 8(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r9b, %al -; CHECK-SSE1-NEXT: andb 8(%rcx), %al +; CHECK-SSE1-NEXT: andb 8(%r12), %al ; CHECK-SSE1-NEXT: xorb %r9b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%r12), %eax +; CHECK-SSE1-NEXT: movzbl 9(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r8b, %al -; CHECK-SSE1-NEXT: andb 9(%rcx), %al +; CHECK-SSE1-NEXT: andb 
9(%r12), %al ; CHECK-SSE1-NEXT: xorb %r8b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 10(%r12), %edx +; CHECK-SSE1-NEXT: movzbl 10(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 10(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 11(%r12), %edx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 10(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 11(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 11(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 12(%r12), %edx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 11(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 12(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 12(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 13(%r12), %edx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 12(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 13(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 13(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 14(%r12), %edx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 13(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 14(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 14(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 15(%r12), %eax -; CHECK-SSE1-NEXT: xorb %r15b, %al -; CHECK-SSE1-NEXT: andb 15(%rcx), %al -; CHECK-SSE1-NEXT: xorb %r15b, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 16(%r12), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 16(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 17(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 17(%r12), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 17(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 18(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 18(%r12), %edx -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 18(%rcx), %dl -; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: 
movzbl 19(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 19(%r12), %r15d -; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: andb 19(%rcx), %r15b -; CHECK-SSE1-NEXT: movq %rcx, %rdx -; CHECK-SSE1-NEXT: xorb %al, %r15b -; CHECK-SSE1-NEXT: movzbl 20(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 20(%r12), %r14d -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: andb 20(%rcx), %r14b -; CHECK-SSE1-NEXT: xorb %al, %r14b -; CHECK-SSE1-NEXT: movzbl 21(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 21(%r12), %ebp +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 14(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 15(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 15(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 16(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 16(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 16(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 17(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 17(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 17(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 18(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 18(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 18(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 19(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 19(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 19(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 20(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 20(%r14), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 20(%r12), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 21(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 21(%r14), %ebp ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: andb 21(%rcx), %bpl +; CHECK-SSE1-NEXT: andb 21(%r12), %bpl ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: movzbl 22(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 22(%r12), %ebx +; CHECK-SSE1-NEXT: movzbl 22(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebx ; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb 22(%rcx), %bl +; CHECK-SSE1-NEXT: andb 22(%r12), %bl ; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: movzbl 23(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 23(%r12), %r11d +; CHECK-SSE1-NEXT: movzbl 23(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 23(%r14), %r11d ; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: andb 23(%rcx), %r11b +; CHECK-SSE1-NEXT: andb 23(%r12), %r11b ; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: movzbl 24(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 24(%r12), %r10d -; CHECK-SSE1-NEXT: xorb %al, %r10b -; CHECK-SSE1-NEXT: andb 24(%rcx), %r10b -; CHECK-SSE1-NEXT: xorb %al, %r10b -; CHECK-SSE1-NEXT: movzbl 25(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 25(%r12), %r9d +; CHECK-SSE1-NEXT: movzbl 24(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 24(%r14), %r9d ; CHECK-SSE1-NEXT: xorb %al, %r9b -; 
CHECK-SSE1-NEXT: andb 25(%rcx), %r9b +; CHECK-SSE1-NEXT: andb 24(%r12), %r9b ; CHECK-SSE1-NEXT: xorb %al, %r9b -; CHECK-SSE1-NEXT: movzbl 26(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 26(%r12), %r8d +; CHECK-SSE1-NEXT: movzbl 25(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 25(%r14), %r8d ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: andb 26(%rcx), %r8b +; CHECK-SSE1-NEXT: andb 25(%r12), %r8b ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: movzbl 27(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 27(%r12), %edi +; CHECK-SSE1-NEXT: movzbl 26(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 26(%r14), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: andb 27(%rcx), %dil +; CHECK-SSE1-NEXT: andb 26(%r12), %dil ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: movzbl 28(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 28(%r12), %esi +; CHECK-SSE1-NEXT: movzbl 27(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 27(%r14), %esi ; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: andb 28(%rcx), %sil +; CHECK-SSE1-NEXT: andb 27(%r12), %sil ; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: movzbl 29(%r13), %eax -; CHECK-SSE1-NEXT: movzbl 29(%r12), %ecx +; CHECK-SSE1-NEXT: movzbl 28(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 28(%r14), %edx +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: andb 28(%r12), %dl +; CHECK-SSE1-NEXT: xorb %al, %dl +; CHECK-SSE1-NEXT: movzbl 29(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 29(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 29(%rdx), %cl +; CHECK-SSE1-NEXT: andb 29(%r12), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl 30(%r13), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 30(%r12), %eax -; CHECK-SSE1-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-SSE1-NEXT: andb 30(%rdx), %al -; CHECK-SSE1-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movzbl 31(%r13), %r13d -; CHECK-SSE1-NEXT: movzbl 31(%r12), %r12d -; CHECK-SSE1-NEXT: xorb %r13b, %r12b -; CHECK-SSE1-NEXT: andb 31(%rdx), %r12b -; CHECK-SSE1-NEXT: xorb %r13b, %r12b -; CHECK-SSE1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; CHECK-SSE1-NEXT: movb %r12b, 31(%r13) +; CHECK-SSE1-NEXT: movzbl 30(%r15), %r10d +; CHECK-SSE1-NEXT: movzbl 30(%r14), %eax +; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: andb 30(%r12), %al +; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: movzbl 31(%r15), %r10d +; CHECK-SSE1-NEXT: movzbl 31(%r14), %r14d +; CHECK-SSE1-NEXT: xorb %r10b, %r14b +; CHECK-SSE1-NEXT: andb 31(%r12), %r14b +; CHECK-SSE1-NEXT: xorb %r10b, %r14b +; CHECK-SSE1-NEXT: movb %r14b, 31(%r13) ; CHECK-SSE1-NEXT: movb %al, 30(%r13) ; CHECK-SSE1-NEXT: movb %cl, 29(%r13) -; CHECK-SSE1-NEXT: movb %sil, 28(%r13) -; CHECK-SSE1-NEXT: movb %dil, 27(%r13) -; CHECK-SSE1-NEXT: movb %r8b, 26(%r13) -; CHECK-SSE1-NEXT: movb %r9b, 25(%r13) -; CHECK-SSE1-NEXT: movb %r10b, 24(%r13) +; CHECK-SSE1-NEXT: movb %dl, 28(%r13) +; CHECK-SSE1-NEXT: movb %sil, 27(%r13) +; CHECK-SSE1-NEXT: movb %dil, 26(%r13) +; CHECK-SSE1-NEXT: movb %r8b, 25(%r13) +; CHECK-SSE1-NEXT: movb %r9b, 24(%r13) ; CHECK-SSE1-NEXT: movb %r11b, 23(%r13) ; CHECK-SSE1-NEXT: movb %bl, 22(%r13) ; CHECK-SSE1-NEXT: movb %bpl, 21(%r13) -; CHECK-SSE1-NEXT: movb %r14b, 20(%r13) -; CHECK-SSE1-NEXT: movb %r15b, 19(%r13) +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; CHECK-SSE1-NEXT: movb %al, 20(%r13) +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; 
CHECK-SSE1-NEXT: movb %al, 19(%r13) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: movb %al, 18(%r13) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index 8df101852f06b..ebb5e135eacd0 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -45,15 +45,14 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $16, %esp -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %eax ; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testb $32, %ch +; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel %eax, %edx ; X86-NEXT: cmovnel %ebx, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill @@ -62,31 +61,32 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testb $32, %ch +; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi +; X86-NEXT: movb %ch, %cl ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: testb $32, %cl +; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: cmovnel %ebx, %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: shrl %cl, %edi -; X86-NEXT: testb $32, %cl +; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovel %edi, %ebx ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movb %ch, %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: shrdl %cl, %ebp, %eax -; X86-NEXT: testb $32, %ch +; X86-NEXT: testb $32, %cl ; X86-NEXT: cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movb %ch, %cl ; X86-NEXT: shrdl %cl, %edx, %ebp -; X86-NEXT: testb $32, {{[0-9]+}}(%esp) +; X86-NEXT: testb $32, %ch ; X86-NEXT: cmovnel %edi, %ebp ; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -169,7 +169,7 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx @@ -178,36 +178,38 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shrl %cl, %ebp ; X86-NEXT: cmpl %ebp, %ebx +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovnel %edx, %esi ; X86-NEXT: movl $-1, %ebx -; X86-NEXT: cmovnel %ebx, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movb %dl, %cl -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movb %ah, %cl +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: shrl %cl, %ebp ; X86-NEXT: 
cmpl %ebp, %edi -; X86-NEXT: cmovnel %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpl %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: cmovnel %ebx, %edx +; X86-NEXT: movl $-1, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: shrl %cl, %ebp +; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmovnel %eax, %edi +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnel %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, 12(%ecx) -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: movl $-1, %eax +; X86-NEXT: cmovnel %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -330,15 +332,16 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovnel %esi, %eax ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl $65535, %eax # imm = 0xFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shll %cl, %ebp ; X86-NEXT: movzwl %bp, %edx ; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmpw %dx, %si ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: cmovnel %eax, %ebp ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: shll %cl, %ebx ; X86-NEXT: movzwl %bx, %esi @@ -355,26 +358,26 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: cmpw %ax, %dx ; X86-NEXT: cmovnel %esi, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzwl %dx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shll %cl, %esi +; X86-NEXT: movzwl %si, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %si +; X86-NEXT: cmpw %ax, %dx ; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %edx +; X86-NEXT: cmovnel %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %esi -; X86-NEXT: shrl %cl, %esi +; X86-NEXT: movzwl %ax, %edx +; X86-NEXT: shrl %cl, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpw %si, %cx +; X86-NEXT: cmpw %dx, %cx ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) -; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movw %si, 12(%ecx) ; X86-NEXT: movw %di, 10(%ecx) ; X86-NEXT: movw %bx, 8(%ecx) ; X86-NEXT: movw %bp, 6(%ecx) diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll index 
61e648eec855f..99a3821bb9ba9 100644 --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -247,7 +247,7 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm6 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -259,7 +259,7 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm6 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -271,11 +271,11 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -295,18 +295,18 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 
= xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8: @@ -511,7 +511,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm6 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -523,7 +523,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm6 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -535,11 +535,11 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -559,18 +559,18 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in ; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: @@ -713,7 +713,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: movd %eax, %xmm3 ; SSE3-NEXT: andl $31, %r15d ; SSE3-NEXT: movzbl 32(%rsp,%r15), %eax -; SSE3-NEXT: movd %eax, %xmm5 +; SSE3-NEXT: movd %eax, %xmm6 ; SSE3-NEXT: andl $31, %r14d ; SSE3-NEXT: movzbl 64(%rsp,%r14), %eax ; SSE3-NEXT: movd %eax, %xmm7 @@ -722,7 +722,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: movd %eax, %xmm8 ; SSE3-NEXT: andl $31, %r11d ; SSE3-NEXT: movzbl 128(%rsp,%r11), %eax -; SSE3-NEXT: movd %eax, %xmm6 +; SSE3-NEXT: movd %eax, %xmm5 ; SSE3-NEXT: andl $31, %r10d ; SSE3-NEXT: movzbl 160(%rsp,%r10), %eax ; SSE3-NEXT: movd %eax, %xmm9 @@ -731,10 +731,10 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: movd %eax, %xmm10 ; SSE3-NEXT: andl $31, %r8d ; SSE3-NEXT: movzbl 224(%rsp,%r8), %eax -; SSE3-NEXT: movd %eax, %xmm11 +; SSE3-NEXT: movd %eax, %xmm12 ; SSE3-NEXT: andl $31, %edi ; SSE3-NEXT: movzbl 256(%rsp,%rdi), %eax -; SSE3-NEXT: movd %eax, %xmm12 +; SSE3-NEXT: movd %eax, %xmm11 ; SSE3-NEXT: andl $31, %esi ; SSE3-NEXT: movzbl 288(%rsp,%rsi), %eax ; SSE3-NEXT: movd %eax, %xmm13 @@ -751,18 +751,18 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE3-NEXT: 
punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE3-NEXT: popq %rbx ; SSE3-NEXT: popq %r12 @@ -845,7 +845,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: andl $31, %r15d ; SSSE3-NEXT: movzbl 32(%rsp,%r15), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: andl $31, %r14d ; SSSE3-NEXT: movzbl 64(%rsp,%r14), %eax ; SSSE3-NEXT: movd %eax, %xmm7 @@ -854,7 +854,7 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: movd %eax, %xmm8 ; SSSE3-NEXT: andl $31, %r11d ; SSSE3-NEXT: movzbl 128(%rsp,%r11), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: andl $31, %r10d ; SSSE3-NEXT: 
movzbl 160(%rsp,%r10), %eax ; SSSE3-NEXT: movd %eax, %xmm9 @@ -863,10 +863,10 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: andl $31, %r8d ; SSSE3-NEXT: movzbl 224(%rsp,%r8), %eax -; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movd %eax, %xmm12 ; SSSE3-NEXT: andl $31, %edi ; SSSE3-NEXT: movzbl 256(%rsp,%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm11 ; SSSE3-NEXT: andl $31, %esi ; SSSE3-NEXT: movzbl 288(%rsp,%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm13 @@ -883,18 +883,18 @@ define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %in ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSSE3-NEXT: punpckldq 
{{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll index b55fd27d4036c..032ffb0d0bf7d 100644 --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -1093,21 +1093,21 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm4 +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 +; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512F-NEXT: andl $63, %esi ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vpextrd $3, %xmm5, %eax ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 @@ -1123,44 +1123,44 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: vmovd %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax +; AVX512F-NEXT: vpextrd $2, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512F-NEXT: vpextrd $3, %xmm5, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd 
%xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax +; AVX512F-NEXT: vpextrd $2, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm5, %xmm1 +; AVX512F-NEXT: vpextrd $3, %xmm5, %eax +; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm1 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 ; AVX512F-NEXT: vmovd %xmm1, %eax @@ -1344,21 +1344,21 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 +; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: andl $63, %esi ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 @@ -1374,44 +1374,44 @@ define void @var_cvt_shuffle_v64f32_v64i8_idx(ptr %dst, <64 x i8> %src, i32 %b) ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm5, %eax +; AVX512BW-NEXT: vmovd %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: 
vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax +; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm5, %xmm1 +; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax +; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm1 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 ; AVX512BW-NEXT: vmovd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll index a7ac1a19ea024..17c5ff7955106 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -156,15 +156,14 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: movl %eax, %esi ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovsd %xmm3, (%esp) -; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: xorl %ebx, %ebx ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) @@ -203,10 +202,11 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: movl %eax, %edi ; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) @@ -234,14 +234,14 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %esi ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi -; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; 
AVX512VL-32-NEXT: shll $31, %edi -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %edx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -264,10 +264,10 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX512VL-32-NEXT: shll $31, %ebx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: shll $31, %edi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 -; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -448,16 +448,15 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 ; AVX512VL-32-NEXT: setae %al ; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: movl %eax, %esi ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, (%esp) ; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0] -; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: xorl %ebx, %ebx ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 -; AVX512VL-32-NEXT: setae %al -; AVX512VL-32-NEXT: kmovw %eax, %k1 -; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm4 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) @@ -495,10 +494,11 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 -; AVX512VL-32-NEXT: setae %bl -; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: movl %eax, %edi ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) @@ -526,14 +526,14 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) ; AVX512VL-32-NEXT: wait +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %esi ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %esi -; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VL-32-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX512VL-32-NEXT: shll $31, %edi -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 -; AVX512VL-32-NEXT: vpinsrd $3, 
%edi, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 ; AVX512VL-32-NEXT: shll $31, %edx ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -556,10 +556,10 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { ; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax ; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX512VL-32-NEXT: shll $31, %ebx -; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: shll $31, %edi +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edi ; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 -; AVX512VL-32-NEXT: vpinsrd $3, %ebx, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, %edi, %xmm3, %xmm3 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 ; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 88e4b11b66f54..b275814cc8033 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -423,148 +423,150 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %esi, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE2-NEXT: movd %r9d, %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero ; SSE2-NEXT: pmuludq %xmm6, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 -; SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm4, %xmm8 ; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 ; SSE2-NEXT: pand 
%xmm5, %xmm9 ; SSE2-NEXT: paddd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm8, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSE2-NEXT: psubd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: paddd %xmm8, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: psubd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: psubd %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %xmm0, 16(%rcx) ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm5, %xmm0 ; SSE2-NEXT: movq %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movdqa %xmm4, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: smulo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movd %esi, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: movd 
{{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSSE3-NEXT: movd %r9d, %xmm0 -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero ; SSSE3-NEXT: pmuludq %xmm6, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: pxor %xmm8, %xmm8 ; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 -; SSSE3-NEXT: pand %xmm1, %xmm8 +; SSSE3-NEXT: pand %xmm4, %xmm8 ; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9 ; SSSE3-NEXT: pand %xmm5, %xmm9 ; SSSE3-NEXT: paddd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSSE3-NEXT: pmuludq %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: pmuludq %xmm8, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSSE3-NEXT: psubd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSSE3-NEXT: movdqa %xmm1, (%rcx) -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: movdqa %xmm4, (%rcx) +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 ; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSSE3-NEXT: pxor %xmm5, %xmm4 ; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 ; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: paddd %xmm8, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: paddd %xmm8, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSSE3-NEXT: pmuludq %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSSE3-NEXT: psubd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSSE3-NEXT: pmuludq %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = 
xmm1[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSSE3-NEXT: psubd %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %xmm0, 16(%rcx) ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 ; SSSE3-NEXT: pxor %xmm5, %xmm0 ; SSSE3-NEXT: movq %xmm0, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: smulo_v6i32: @@ -579,25 +581,25 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmuldq %xmm2, %xmm0 ; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: movd %r9d, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pmuldq %xmm3, %xmm4 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm3 -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx -; SSE41-NEXT: pinsrd $1, %edx, %xmm5 +; SSE41-NEXT: pinsrd $1, %edx, %xmm3 +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %esi +; SSE41-NEXT: pinsrd $1, %esi, %xmm5 ; SSE41-NEXT: pmulld %xmm3, %xmm5 ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE41-NEXT: movd %ecx, %xmm3 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; SSE41-NEXT: movd %edx, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE41-NEXT: movd %edx, %xmm6 +; SSE41-NEXT: movd %esi, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] ; SSE41-NEXT: pmuldq %xmm3, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] -; SSE41-NEXT: movq %xmm5, 16(%rsi) +; SSE41-NEXT: movq %xmm5, 16(%rcx) ; SSE41-NEXT: psrad $31, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 @@ -608,7 +610,7 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] ; SSE41-NEXT: pmulld %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, (%rsi) +; SSE41-NEXT: movdqa %xmm1, (%rcx) ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm1 @@ -1887,103 +1889,103 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] -; SSE2-NEXT: pmulhw %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm8 -; SSE2-NEXT: psrlw $8, %xmm8 ; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = 
xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] +; SSE2-NEXT: pmulhw %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm8 +; SSE2-NEXT: psrlw $8, %xmm8 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSE2-NEXT: pmulhw %xmm10, %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: psrlw $8, %xmm10 -; SSE2-NEXT: packuswb %xmm8, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm8, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: packuswb %xmm9, %xmm7 +; SSE2-NEXT: pmulhw %xmm9, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: psrlw $8, %xmm11 +; SSE2-NEXT: packuswb %xmm8, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm7 +; SSE2-NEXT: packuswb %xmm10, %xmm7 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm7, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm10, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSE2-NEXT: pcmpeqb %xmm11, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSE2-NEXT: pmulhw %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; SSE2-NEXT: pmulhw %xmm8, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm10, %xmm2 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pmulhw %xmm9, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: psrlw $8, %xmm9 -; SSE2-NEXT: packuswb %xmm2, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pmulhw %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: psrlw $8, %xmm8 +; SSE2-NEXT: packuswb %xmm2, %xmm8 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm6 ; SSE2-NEXT: packuswb %xmm10, %xmm6 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE2-NEXT: pcmpeqb %xmm8, 
%xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; SSE2-NEXT: pmulhw %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE2-NEXT: pmulhw %xmm8, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE2-NEXT: pxor %xmm5, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmulhw %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: psrlw $8, %xmm9 -; SSE2-NEXT: packuswb %xmm1, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pmulhw %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: psrlw $8, %xmm8 +; SSE2-NEXT: packuswb %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm5 ; SSE2-NEXT: packuswb %xmm10, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE2-NEXT: pcmpeqb %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] ; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSE2-NEXT: pmulhw %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: pmulhw %xmm8, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm10, %xmm0 ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pmulhw %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: psrlw $8, %xmm9 -; SSE2-NEXT: packuswb %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packuswb 
%xmm10, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm8 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm8 +; SSE2-NEXT: pmulhw %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: psrlw $8, %xmm11 +; SSE2-NEXT: packuswb %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm9, %xmm8 +; SSE2-NEXT: packuswb %xmm10, %xmm8 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm8, %xmm4 +; SSE2-NEXT: pcmpeqb %xmm11, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm7, 48(%rsi) ; SSE2-NEXT: movdqa %xmm1, %xmm7 ; SSE2-NEXT: movdqa %xmm6, 32(%rsi) ; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: movdqa %xmm5, 16(%rsi) ; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm4, (%rsi) -; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm8, (%rsi) +; SSE2-NEXT: movdqa %xmm3, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm3, 192(%rdi) @@ -1995,20 +1997,20 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm1, 64(%rdi) -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movdqa %xmm8, (%rdi) -; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 224(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm8 ; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, 240(%rdi) +; SSE2-NEXT: movdqa %xmm8, 224(%rdi) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 240(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] @@ -2066,103 +2068,103 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: pxor %xmm8, %xmm8 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm9 -; SSSE3-NEXT: movdqa 
%xmm9, %xmm8 -; SSSE3-NEXT: psrlw $8, %xmm8 ; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] +; SSSE3-NEXT: pmulhw %xmm8, %xmm10 +; SSSE3-NEXT: movdqa %xmm10, %xmm8 +; SSSE3-NEXT: psrlw $8, %xmm8 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; SSSE3-NEXT: pxor %xmm7, %xmm7 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSSE3-NEXT: pmulhw %xmm10, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: psrlw $8, %xmm10 -; SSSE3-NEXT: packuswb %xmm8, %xmm10 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: pand %xmm8, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm7 -; SSSE3-NEXT: packuswb %xmm9, %xmm7 +; SSSE3-NEXT: pmulhw %xmm9, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, %xmm11 +; SSSE3-NEXT: psrlw $8, %xmm11 +; SSSE3-NEXT: packuswb %xmm8, %xmm11 +; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm9, %xmm7 +; SSSE3-NEXT: packuswb %xmm10, %xmm7 ; SSSE3-NEXT: pxor %xmm3, %xmm3 ; SSSE3-NEXT: pcmpgtb %xmm7, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm10, %xmm3 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSSE3-NEXT: pcmpeqb %xmm11, %xmm3 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSSE3-NEXT: pmulhw %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; SSSE3-NEXT: pmulhw %xmm8, %xmm10 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; SSSE3-NEXT: movdqa %xmm10, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pmulhw %xmm9, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: psrlw $8, %xmm9 -; SSSE3-NEXT: packuswb %xmm2, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pmulhw %xmm8, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: psrlw $8, %xmm8 +; SSSE3-NEXT: packuswb %xmm2, %xmm8 +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm9, %xmm6 ; SSSE3-NEXT: packuswb 
%xmm10, %xmm6 ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pcmpgtb %xmm6, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSSE3-NEXT: pcmpeqb %xmm8, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; SSSE3-NEXT: pmulhw %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSSE3-NEXT: pmulhw %xmm8, %xmm10 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSSE3-NEXT: pxor %xmm5, %xmm5 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSSE3-NEXT: movdqa %xmm10, %xmm1 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: pmulhw %xmm9, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm9 -; SSSE3-NEXT: psrlw $8, %xmm9 -; SSSE3-NEXT: packuswb %xmm1, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pmulhw %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: psrlw $8, %xmm8 +; SSSE3-NEXT: packuswb %xmm1, %xmm8 +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm9, %xmm5 ; SSSE3-NEXT: packuswb %xmm10, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm1 ; SSSE3-NEXT: pcmpgtb %xmm5, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSSE3-NEXT: pcmpeqb %xmm8, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSSE3-NEXT: pmulhw %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT: pmulhw %xmm8, %xmm10 +; SSSE3-NEXT: pxor %xmm11, %xmm11 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pmulhw %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSSE3-NEXT: psrlw $8, %xmm9 -; SSSE3-NEXT: packuswb %xmm0, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: packuswb %xmm10, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtb %xmm4, %xmm8 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm8 +; SSSE3-NEXT: pmulhw %xmm11, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm11 +; SSSE3-NEXT: psrlw $8, %xmm11 +; SSSE3-NEXT: packuswb %xmm0, %xmm11 +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm9, %xmm8 +; SSSE3-NEXT: packuswb %xmm10, %xmm8 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pcmpgtb %xmm8, %xmm4 +; SSSE3-NEXT: pcmpeqb %xmm11, %xmm4 ; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: pxor %xmm0, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm7, 48(%rsi) ; SSSE3-NEXT: movdqa %xmm1, %xmm7 ; SSSE3-NEXT: movdqa %xmm6, 32(%rsi) ; SSSE3-NEXT: movdqa %xmm2, %xmm6 ; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, (%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm8, (%rsi) +; SSSE3-NEXT: movdqa %xmm3, %xmm8 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm3, 192(%rdi) @@ -2174,20 +2176,20 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm8, %xmm1 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm8, (%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm8 ; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, 240(%rdi) +; SSSE3-NEXT: movdqa %xmm8, 224(%rdi) +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] @@ -2415,96 +2417,96 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpunpckhbw 
{{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm8 -; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm6 +; AVX1-NEXT: vpmulhw %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm8 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX1-NEXT: vpmulhw %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm7 -; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 -; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm8 -; AVX1-NEXT: vpcmpeqb %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-NEXT: vpackuswb %xmm8, %xmm7, %xmm8 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm6, %xmm8, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX1-NEXT: vpmulhw %xmm8, %xmm9, %xmm8 -; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm9 +; AVX1-NEXT: vpmulhw %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm9 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm3 ; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm8 -; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm8 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm8, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] ; AVX1-NEXT: vpmulhw %xmm9, %xmm11, %xmm9 ; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm11 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX1-NEXT: vpmulhw %xmm3, %xmm10, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm10 +; AVX1-NEXT: vpmulhw %xmm6, %xmm10, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm10 ; AVX1-NEXT: vpackuswb %xmm11, %xmm10, %xmm10 -; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9 -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm9 -; AVX1-NEXT: vpcmpeqb %xmm10, %xmm9, %xmm10 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm9 +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpackuswb %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm6, %xmm5, %xmm9 +; AVX1-NEXT: vpcmpeqb %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX1-NEXT: vpmulhw %xmm9, %xmm11, %xmm9 -; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm11 +; AVX1-NEXT: vpmulhw %xmm10, %xmm11, %xmm10 +; AVX1-NEXT: vpsrlw $8, %xmm10, %xmm11 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 ; AVX1-NEXT: vpackuswb %xmm11, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm9 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm9, %xmm0, %xmm9 -; AVX1-NEXT: vpcmpgtb %xmm9, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm10 +; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm10, %xmm0, %xmm7 +; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm11, %xmm11, %xmm11 -; AVX1-NEXT: vpxor %xmm7, %xmm11, %xmm6 -; AVX1-NEXT: vpxor %xmm11, %xmm8, %xmm5 -; AVX1-NEXT: vpxor %xmm11, %xmm10, %xmm2 -; AVX1-NEXT: vpxor %xmm0, %xmm11, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm10, %xmm10, %xmm10 +; AVX1-NEXT: vpxor %xmm10, %xmm8, %xmm5 +; AVX1-NEXT: vpxor %xmm3, %xmm10, %xmm3 +; AVX1-NEXT: vpxor %xmm10, %xmm9, %xmm2 +; AVX1-NEXT: vpxor 
%xmm0, %xmm10, %xmm0 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rsi) ; AVX1-NEXT: vmovdqa %xmm1, 32(%rsi) -; AVX1-NEXT: vmovdqa %xmm3, 16(%rsi) -; AVX1-NEXT: vmovdqa %xmm9, (%rsi) -; AVX1-NEXT: vpmovsxbd %xmm6, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 192(%rdi) +; AVX1-NEXT: vmovdqa %xmm6, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm7, (%rsi) ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 192(%rdi) +; AVX1-NEXT: vpmovsxbd %xmm3, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 128(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm2, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 64(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 240(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 160(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 176(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 144(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] @@ -3302,18 +3304,18 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: movq %rdx, %r8 ; SSE2-NEXT: movq %rsi, %r11 ; SSE2-NEXT: movq %rdi, %r10 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE2-NEXT: movq %r11, %rdx +; SSE2-NEXT: movq %rsi, %rdx ; SSE2-NEXT: sarq $63, %rdx ; SSE2-NEXT: movq %r9, %rbx ; SSE2-NEXT: imulq %rdx, %rbx ; SSE2-NEXT: movq %r15, %rax ; SSE2-NEXT: mulq %rdx -; SSE2-NEXT: movq %rdx, %rdi +; SSE2-NEXT: movq %rdx, %rsi ; SSE2-NEXT: movq %rax, %r12 -; SSE2-NEXT: addq %rax, %rdi -; SSE2-NEXT: addq %rbx, %rdi +; SSE2-NEXT: addq %rax, %rsi +; SSE2-NEXT: addq %rbx, %rsi ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: sarq $63, %rax ; SSE2-NEXT: movq %rax, %r13 @@ -3324,11 +3326,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: addq %r13, %rbx ; SSE2-NEXT: addq %rax, %rbx ; SSE2-NEXT: addq %r12, %r14 -; SSE2-NEXT: adcq %rdi, %rbx +; SSE2-NEXT: adcq %rsi, %rbx ; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r15 ; SSE2-NEXT: movq %rdx, %r12 -; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r15 ; SSE2-NEXT: movq %rdx, %r15 @@ -3359,63 +3361,63 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: setne %r15b ; SSE2-NEXT: movq %rcx, %rdx ; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movq %rbp, %r11 -; SSE2-NEXT: imulq %rdx, %r11 -; SSE2-NEXT: movq %rsi, %rax +; SSE2-NEXT: movq %rbp, %r10 +; SSE2-NEXT: imulq %rdx, %r10 +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %rdx ; SSE2-NEXT: movq %rdx, %r9 -; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %rax, %rbx ; SSE2-NEXT: addq %rax, %r9 -; SSE2-NEXT: addq %r11, %r9 +; 
SSE2-NEXT: addq %r10, %r9 ; SSE2-NEXT: movq %rbp, %rax ; SSE2-NEXT: sarq $63, %rax ; SSE2-NEXT: movq %rax, %r14 ; SSE2-NEXT: imulq %rcx, %r14 ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: movq %rdx, %rbx -; SSE2-NEXT: addq %r14, %rbx -; SSE2-NEXT: addq %rax, %rbx -; SSE2-NEXT: addq %r10, %r11 -; SSE2-NEXT: adcq %r9, %rbx +; SSE2-NEXT: movq %rdx, %r10 +; SSE2-NEXT: addq %r14, %r10 +; SSE2-NEXT: addq %rax, %r10 +; SSE2-NEXT: addq %rbx, %r11 +; SSE2-NEXT: adcq %r9, %r10 ; SSE2-NEXT: movq %r8, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: movq %rdx, %r9 -; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: mulq %rdi +; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: movq %rax, %r9 ; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rsi -; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: mulq %rdi +; SSE2-NEXT: movq %rdx, %rdi ; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: addq %r9, %r14 -; SSE2-NEXT: adcq $0, %rsi +; SSE2-NEXT: addq %rbx, %r14 +; SSE2-NEXT: adcq $0, %rdi ; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: mulq %rbp ; SSE2-NEXT: movq %rdx, %r8 -; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: addq %r14, %r9 -; SSE2-NEXT: adcq %rsi, %r8 +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: addq %r14, %rbx +; SSE2-NEXT: adcq %rdi, %r8 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %esi +; SSE2-NEXT: movzbl %al, %edi ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rbp ; SSE2-NEXT: addq %r8, %rax -; SSE2-NEXT: adcq %rsi, %rdx +; SSE2-NEXT: adcq %rdi, %rdx ; SSE2-NEXT: addq %r11, %rax -; SSE2-NEXT: adcq %rbx, %rdx -; SSE2-NEXT: movq %r9, 24(%r12) -; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: xorq %r9, %rdx -; SSE2-NEXT: xorq %rax, %r9 +; SSE2-NEXT: adcq %r10, %rdx +; SSE2-NEXT: movq %rbx, 24(%r12) +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: xorq %rbx, %rdx +; SSE2-NEXT: xorq %rax, %rbx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdx, %r9 +; SSE2-NEXT: orq %rdx, %rbx ; SSE2-NEXT: setne %al ; SSE2-NEXT: negl %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: negl %r15d ; SSE2-NEXT: movd %r15d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %r10, 16(%r12) -; SSE2-NEXT: movq %rdi, (%r12) +; SSE2-NEXT: movq %r9, 16(%r12) +; SSE2-NEXT: movq %rsi, (%r12) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -3436,18 +3438,18 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: movq %rdx, %r8 ; SSSE3-NEXT: movq %rsi, %r11 ; SSSE3-NEXT: movq %rdi, %r10 -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSSE3-NEXT: movq %r11, %rdx +; SSSE3-NEXT: movq %rsi, %rdx ; SSSE3-NEXT: sarq $63, %rdx ; SSSE3-NEXT: movq %r9, %rbx ; SSSE3-NEXT: imulq %rdx, %rbx ; SSSE3-NEXT: movq %r15, %rax ; SSSE3-NEXT: mulq %rdx -; SSSE3-NEXT: movq %rdx, %rdi +; SSSE3-NEXT: movq %rdx, %rsi ; SSSE3-NEXT: movq %rax, %r12 -; SSSE3-NEXT: addq %rax, %rdi -; SSSE3-NEXT: addq %rbx, %rdi +; SSSE3-NEXT: addq %rax, %rsi +; SSSE3-NEXT: addq %rbx, %rsi ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: sarq $63, %rax ; SSSE3-NEXT: movq %rax, %r13 @@ -3458,11 +3460,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: addq %r13, %rbx ; SSSE3-NEXT: addq %rax, %rbx ; SSSE3-NEXT: addq %r12, %r14 -; SSSE3-NEXT: adcq %rdi, %rbx +; SSSE3-NEXT: adcq %rsi, %rbx ; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r15 ; SSSE3-NEXT: movq %rdx, %r12 -; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: movq %rax, %rsi ; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r15 ; 
SSSE3-NEXT: movq %rdx, %r15 @@ -3493,63 +3495,63 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: setne %r15b ; SSSE3-NEXT: movq %rcx, %rdx ; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movq %rbp, %r11 -; SSSE3-NEXT: imulq %rdx, %r11 -; SSSE3-NEXT: movq %rsi, %rax +; SSSE3-NEXT: movq %rbp, %r10 +; SSSE3-NEXT: imulq %rdx, %r10 +; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %rdx ; SSSE3-NEXT: movq %rdx, %r9 -; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movq %rax, %rbx ; SSSE3-NEXT: addq %rax, %r9 -; SSSE3-NEXT: addq %r11, %r9 +; SSSE3-NEXT: addq %r10, %r9 ; SSSE3-NEXT: movq %rbp, %rax ; SSSE3-NEXT: sarq $63, %rax ; SSSE3-NEXT: movq %rax, %r14 ; SSSE3-NEXT: imulq %rcx, %r14 ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: movq %rdx, %rbx -; SSSE3-NEXT: addq %r14, %rbx -; SSSE3-NEXT: addq %rax, %rbx -; SSSE3-NEXT: addq %r10, %r11 -; SSSE3-NEXT: adcq %r9, %rbx +; SSSE3-NEXT: movq %rdx, %r10 +; SSSE3-NEXT: addq %r14, %r10 +; SSSE3-NEXT: addq %rax, %r10 +; SSSE3-NEXT: addq %rbx, %r11 +; SSSE3-NEXT: adcq %r9, %r10 ; SSSE3-NEXT: movq %r8, %rax -; SSSE3-NEXT: mulq %rsi -; SSSE3-NEXT: movq %rdx, %r9 -; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: mulq %rdi +; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: movq %rax, %r9 ; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: mulq %rsi -; SSSE3-NEXT: movq %rdx, %rsi +; SSSE3-NEXT: mulq %rdi +; SSSE3-NEXT: movq %rdx, %rdi ; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: addq %r9, %r14 -; SSSE3-NEXT: adcq $0, %rsi +; SSSE3-NEXT: addq %rbx, %r14 +; SSSE3-NEXT: adcq $0, %rdi ; SSSE3-NEXT: movq %r8, %rax ; SSSE3-NEXT: mulq %rbp ; SSSE3-NEXT: movq %rdx, %r8 -; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: addq %r14, %r9 -; SSSE3-NEXT: adcq %rsi, %r8 +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: addq %r14, %rbx +; SSSE3-NEXT: adcq %rdi, %r8 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %esi +; SSSE3-NEXT: movzbl %al, %edi ; SSSE3-NEXT: movq %rcx, %rax ; SSSE3-NEXT: mulq %rbp ; SSSE3-NEXT: addq %r8, %rax -; SSSE3-NEXT: adcq %rsi, %rdx +; SSSE3-NEXT: adcq %rdi, %rdx ; SSSE3-NEXT: addq %r11, %rax -; SSSE3-NEXT: adcq %rbx, %rdx -; SSSE3-NEXT: movq %r9, 24(%r12) -; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: xorq %r9, %rdx -; SSSE3-NEXT: xorq %rax, %r9 +; SSSE3-NEXT: adcq %r10, %rdx +; SSSE3-NEXT: movq %rbx, 24(%r12) +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: xorq %rbx, %rdx +; SSSE3-NEXT: xorq %rax, %rbx ; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: orq %rdx, %r9 +; SSSE3-NEXT: orq %rdx, %rbx ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: negl %eax ; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: negl %r15d ; SSSE3-NEXT: movd %r15d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %r10, 16(%r12) -; SSSE3-NEXT: movq %rdi, (%r12) +; SSSE3-NEXT: movq %r9, 16(%r12) +; SSSE3-NEXT: movq %rsi, (%r12) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -3570,18 +3572,18 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: movq %rdx, %r8 ; SSE41-NEXT: movq %rsi, %r11 ; SSE41-NEXT: movq %rdi, %r10 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE41-NEXT: movq %r11, %rdx +; SSE41-NEXT: movq %rsi, %rdx ; SSE41-NEXT: sarq $63, %rdx ; SSE41-NEXT: movq %r9, %rbx ; SSE41-NEXT: imulq %rdx, %rbx ; SSE41-NEXT: movq %r15, %rax ; SSE41-NEXT: mulq %rdx -; SSE41-NEXT: movq %rdx, %rdi +; SSE41-NEXT: movq %rdx, %rsi ; SSE41-NEXT: movq %rax, %r12 -; 
SSE41-NEXT: addq %rax, %rdi -; SSE41-NEXT: addq %rbx, %rdi +; SSE41-NEXT: addq %rax, %rsi +; SSE41-NEXT: addq %rbx, %rsi ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: sarq $63, %rax ; SSE41-NEXT: movq %rax, %r13 @@ -3592,11 +3594,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: addq %r13, %rbx ; SSE41-NEXT: addq %rax, %rbx ; SSE41-NEXT: addq %r12, %r14 -; SSE41-NEXT: adcq %rdi, %rbx +; SSE41-NEXT: adcq %rsi, %rbx ; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r15 ; SSE41-NEXT: movq %rdx, %r12 -; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: movq %rax, %rsi ; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r15 ; SSE41-NEXT: movq %rdx, %r15 @@ -3627,62 +3629,62 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: setne %r15b ; SSE41-NEXT: movq %rcx, %rdx ; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movq %rbp, %r11 -; SSE41-NEXT: imulq %rdx, %r11 -; SSE41-NEXT: movq %rsi, %rax +; SSE41-NEXT: movq %rbp, %r10 +; SSE41-NEXT: imulq %rdx, %r10 +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %rdx ; SSE41-NEXT: movq %rdx, %r9 -; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: movq %rax, %rbx ; SSE41-NEXT: addq %rax, %r9 -; SSE41-NEXT: addq %r11, %r9 +; SSE41-NEXT: addq %r10, %r9 ; SSE41-NEXT: movq %rbp, %rax ; SSE41-NEXT: sarq $63, %rax ; SSE41-NEXT: movq %rax, %r14 ; SSE41-NEXT: imulq %rcx, %r14 ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %r11 -; SSE41-NEXT: movq %rdx, %rbx -; SSE41-NEXT: addq %r14, %rbx -; SSE41-NEXT: addq %rax, %rbx -; SSE41-NEXT: addq %r10, %r11 -; SSE41-NEXT: adcq %r9, %rbx +; SSE41-NEXT: movq %rdx, %r10 +; SSE41-NEXT: addq %r14, %r10 +; SSE41-NEXT: addq %rax, %r10 +; SSE41-NEXT: addq %rbx, %r11 +; SSE41-NEXT: adcq %r9, %r10 ; SSE41-NEXT: movq %r8, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: movq %rdx, %r9 -; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: mulq %rdi +; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: movq %rax, %r9 ; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rsi -; SSE41-NEXT: movq %rdx, %rsi +; SSE41-NEXT: mulq %rdi +; SSE41-NEXT: movq %rdx, %rdi ; SSE41-NEXT: movq %rax, %r14 -; SSE41-NEXT: addq %r9, %r14 -; SSE41-NEXT: adcq $0, %rsi +; SSE41-NEXT: addq %rbx, %r14 +; SSE41-NEXT: adcq $0, %rdi ; SSE41-NEXT: movq %r8, %rax ; SSE41-NEXT: mulq %rbp ; SSE41-NEXT: movq %rdx, %r8 -; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: addq %r14, %r9 -; SSE41-NEXT: adcq %rsi, %r8 +; SSE41-NEXT: movq %rax, %rbx +; SSE41-NEXT: addq %r14, %rbx +; SSE41-NEXT: adcq %rdi, %r8 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %esi +; SSE41-NEXT: movzbl %al, %edi ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %rbp ; SSE41-NEXT: addq %r8, %rax -; SSE41-NEXT: adcq %rsi, %rdx +; SSE41-NEXT: adcq %rdi, %rdx ; SSE41-NEXT: addq %r11, %rax -; SSE41-NEXT: adcq %rbx, %rdx -; SSE41-NEXT: movq %r9, 24(%r12) -; SSE41-NEXT: sarq $63, %r9 -; SSE41-NEXT: xorq %r9, %rdx -; SSE41-NEXT: xorq %rax, %r9 +; SSE41-NEXT: adcq %r10, %rdx +; SSE41-NEXT: movq %rbx, 24(%r12) +; SSE41-NEXT: sarq $63, %rbx +; SSE41-NEXT: xorq %rbx, %rdx +; SSE41-NEXT: xorq %rax, %rbx ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rdx, %r9 +; SSE41-NEXT: orq %rdx, %rbx ; SSE41-NEXT: setne %al ; SSE41-NEXT: negl %eax ; SSE41-NEXT: negl %r15d ; SSE41-NEXT: movd %r15d, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movq %r10, 16(%r12) -; SSE41-NEXT: movq %rdi, (%r12) +; SSE41-NEXT: movq %r9, 16(%r12) +; SSE41-NEXT: movq %rsi, (%r12) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -3703,18 +3705,18 @@ 
define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: movq %rsi, %r11 ; AVX-NEXT: movq %rdi, %r10 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; AVX-NEXT: movq %r11, %rdx +; AVX-NEXT: movq %rsi, %rdx ; AVX-NEXT: sarq $63, %rdx ; AVX-NEXT: movq %r9, %rbx ; AVX-NEXT: imulq %rdx, %rbx ; AVX-NEXT: movq %r15, %rax ; AVX-NEXT: mulq %rdx -; AVX-NEXT: movq %rdx, %rdi +; AVX-NEXT: movq %rdx, %rsi ; AVX-NEXT: movq %rax, %r12 -; AVX-NEXT: addq %rax, %rdi -; AVX-NEXT: addq %rbx, %rdi +; AVX-NEXT: addq %rax, %rsi +; AVX-NEXT: addq %rbx, %rsi ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: sarq $63, %rax ; AVX-NEXT: movq %rax, %r13 @@ -3725,11 +3727,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: addq %r13, %rbx ; AVX-NEXT: addq %rax, %rbx ; AVX-NEXT: addq %r12, %r14 -; AVX-NEXT: adcq %rdi, %rbx +; AVX-NEXT: adcq %rsi, %rbx ; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r15 ; AVX-NEXT: movq %rdx, %r12 -; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: movq %rax, %rsi ; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r15 ; AVX-NEXT: movq %rdx, %r15 @@ -3760,62 +3762,62 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: setne %r15b ; AVX-NEXT: movq %rcx, %rdx ; AVX-NEXT: sarq $63, %rdx -; AVX-NEXT: movq %rbp, %r11 -; AVX-NEXT: imulq %rdx, %r11 -; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: movq %rbp, %r10 +; AVX-NEXT: imulq %rdx, %r10 +; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %rdx ; AVX-NEXT: movq %rdx, %r9 -; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: movq %rax, %rbx ; AVX-NEXT: addq %rax, %r9 -; AVX-NEXT: addq %r11, %r9 +; AVX-NEXT: addq %r10, %r9 ; AVX-NEXT: movq %rbp, %rax ; AVX-NEXT: sarq $63, %rax ; AVX-NEXT: movq %rax, %r14 ; AVX-NEXT: imulq %rcx, %r14 ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %r11 -; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: addq %r14, %rbx -; AVX-NEXT: addq %rax, %rbx -; AVX-NEXT: addq %r10, %r11 -; AVX-NEXT: adcq %r9, %rbx +; AVX-NEXT: movq %rdx, %r10 +; AVX-NEXT: addq %r14, %r10 +; AVX-NEXT: addq %rax, %r10 +; AVX-NEXT: addq %rbx, %r11 +; AVX-NEXT: adcq %r9, %r10 ; AVX-NEXT: movq %r8, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rdx, %r9 -; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: mulq %rdi +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: movq %rax, %r9 ; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rsi -; AVX-NEXT: movq %rdx, %rsi +; AVX-NEXT: mulq %rdi +; AVX-NEXT: movq %rdx, %rdi ; AVX-NEXT: movq %rax, %r14 -; AVX-NEXT: addq %r9, %r14 -; AVX-NEXT: adcq $0, %rsi +; AVX-NEXT: addq %rbx, %r14 +; AVX-NEXT: adcq $0, %rdi ; AVX-NEXT: movq %r8, %rax ; AVX-NEXT: mulq %rbp ; AVX-NEXT: movq %rdx, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: addq %r14, %r9 -; AVX-NEXT: adcq %rsi, %r8 +; AVX-NEXT: movq %rax, %rbx +; AVX-NEXT: addq %r14, %rbx +; AVX-NEXT: adcq %rdi, %r8 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %esi +; AVX-NEXT: movzbl %al, %edi ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rbp ; AVX-NEXT: addq %r8, %rax -; AVX-NEXT: adcq %rsi, %rdx +; AVX-NEXT: adcq %rdi, %rdx ; AVX-NEXT: addq %r11, %rax -; AVX-NEXT: adcq %rbx, %rdx -; AVX-NEXT: movq %r9, 24(%r12) -; AVX-NEXT: sarq $63, %r9 -; AVX-NEXT: xorq %r9, %rdx -; AVX-NEXT: xorq %rax, %r9 +; AVX-NEXT: adcq %r10, %rdx +; AVX-NEXT: movq %rbx, 24(%r12) +; AVX-NEXT: sarq $63, %rbx +; AVX-NEXT: xorq %rbx, %rdx +; AVX-NEXT: xorq %rax, %rbx ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: orq %rdx, %r9 +; AVX-NEXT: orq %rdx, %rbx ; 
AVX-NEXT: setne %al ; AVX-NEXT: negl %eax ; AVX-NEXT: negl %r15d ; AVX-NEXT: vmovd %r15d, %xmm0 ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %r10, 16(%r12) -; AVX-NEXT: movq %rdi, (%r12) +; AVX-NEXT: movq %r9, 16(%r12) +; AVX-NEXT: movq %rsi, (%r12) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -3897,7 +3899,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: movq %r8, %rax ; AVX512F-NEXT: mulq %rdx ; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: movq %rax, %r14 ; AVX512F-NEXT: addq %rax, %r10 ; AVX512F-NEXT: addq %rsi, %r10 ; AVX512F-NEXT: movq %rbp, %rax @@ -3906,26 +3908,26 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: imulq %r9, %rsi ; AVX512F-NEXT: mulq %rdi ; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: movq %rdx, %r14 -; AVX512F-NEXT: addq %rsi, %r14 -; AVX512F-NEXT: addq %rax, %r14 -; AVX512F-NEXT: addq %r11, %rbx -; AVX512F-NEXT: adcq %r10, %r14 +; AVX512F-NEXT: movq %rdx, %r11 +; AVX512F-NEXT: addq %rsi, %r11 +; AVX512F-NEXT: addq %rax, %r11 +; AVX512F-NEXT: addq %r14, %rbx +; AVX512F-NEXT: adcq %r10, %r11 ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %r8 -; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: movq %rdx, %r14 +; AVX512F-NEXT: movq %rax, %r10 ; AVX512F-NEXT: movq %r9, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r8 ; AVX512F-NEXT: movq %rax, %r15 -; AVX512F-NEXT: addq %r10, %r15 +; AVX512F-NEXT: addq %r14, %r15 ; AVX512F-NEXT: adcq $0, %r8 ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %rbp ; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: addq %r15, %r10 +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: addq %r15, %r14 ; AVX512F-NEXT: adcq %r8, %rdi ; AVX512F-NEXT: setb %al ; AVX512F-NEXT: movzbl %al, %esi @@ -3934,12 +3936,12 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: addq %rdi, %rax ; AVX512F-NEXT: adcq %rsi, %rdx ; AVX512F-NEXT: addq %rbx, %rax -; AVX512F-NEXT: adcq %r14, %rdx -; AVX512F-NEXT: movq %r10, 8(%r12) -; AVX512F-NEXT: sarq $63, %r10 -; AVX512F-NEXT: xorq %r10, %rdx -; AVX512F-NEXT: xorq %rax, %r10 -; AVX512F-NEXT: orq %rdx, %r10 +; AVX512F-NEXT: adcq %r11, %rdx +; AVX512F-NEXT: movq %r14, 8(%r12) +; AVX512F-NEXT: sarq $63, %r14 +; AVX512F-NEXT: xorq %r14, %rdx +; AVX512F-NEXT: xorq %rax, %r14 +; AVX512F-NEXT: orq %rdx, %r14 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: kmovw %eax, %k1 @@ -3948,7 +3950,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512F-NEXT: movq %rcx, 16(%r12) -; AVX512F-NEXT: movq %r11, (%r12) +; AVX512F-NEXT: movq %r10, (%r12) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 @@ -4030,7 +4032,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: movq %r8, %rax ; AVX512BW-NEXT: mulq %rdx ; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: movq %rax, %r14 ; AVX512BW-NEXT: addq %rax, %r10 ; AVX512BW-NEXT: addq %rsi, %r10 ; AVX512BW-NEXT: movq %rbp, %rax @@ -4039,26 +4041,26 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: imulq %r9, %rsi ; AVX512BW-NEXT: mulq %rdi ; AVX512BW-NEXT: movq %rax, %rbx 
-; AVX512BW-NEXT: movq %rdx, %r14 -; AVX512BW-NEXT: addq %rsi, %r14 -; AVX512BW-NEXT: addq %rax, %r14 -; AVX512BW-NEXT: addq %r11, %rbx -; AVX512BW-NEXT: adcq %r10, %r14 +; AVX512BW-NEXT: movq %rdx, %r11 +; AVX512BW-NEXT: addq %rsi, %r11 +; AVX512BW-NEXT: addq %rax, %r11 +; AVX512BW-NEXT: addq %r14, %rbx +; AVX512BW-NEXT: adcq %r10, %r11 ; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %r8 -; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: movq %rdx, %r14 +; AVX512BW-NEXT: movq %rax, %r10 ; AVX512BW-NEXT: movq %r9, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r8 ; AVX512BW-NEXT: movq %rax, %r15 -; AVX512BW-NEXT: addq %r10, %r15 +; AVX512BW-NEXT: addq %r14, %r15 ; AVX512BW-NEXT: adcq $0, %r8 ; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %rbp ; AVX512BW-NEXT: movq %rdx, %rdi -; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %r15, %r10 +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: addq %r15, %r14 ; AVX512BW-NEXT: adcq %r8, %rdi ; AVX512BW-NEXT: setb %al ; AVX512BW-NEXT: movzbl %al, %esi @@ -4067,12 +4069,12 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: addq %rdi, %rax ; AVX512BW-NEXT: adcq %rsi, %rdx ; AVX512BW-NEXT: addq %rbx, %rax -; AVX512BW-NEXT: adcq %r14, %rdx -; AVX512BW-NEXT: movq %r10, 8(%r12) -; AVX512BW-NEXT: sarq $63, %r10 -; AVX512BW-NEXT: xorq %r10, %rdx -; AVX512BW-NEXT: xorq %rax, %r10 -; AVX512BW-NEXT: orq %rdx, %r10 +; AVX512BW-NEXT: adcq %r11, %rdx +; AVX512BW-NEXT: movq %r14, 8(%r12) +; AVX512BW-NEXT: sarq $63, %r14 +; AVX512BW-NEXT: xorq %r14, %rdx +; AVX512BW-NEXT: xorq %rax, %r14 +; AVX512BW-NEXT: orq %rdx, %r14 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: andl $1, %eax ; AVX512BW-NEXT: kmovw %eax, %k1 @@ -4081,7 +4083,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512BW-NEXT: movq %rcx, 16(%r12) -; AVX512BW-NEXT: movq %r11, (%r12) +; AVX512BW-NEXT: movq %r10, (%r12) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 657e975a69440..653c3a9969151 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -222,19 +222,19 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -242,38 +242,38 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, (%rcx) -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: movq %xmm3, 16(%rcx) ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-NEXT: movq %xmm2, 16(%rdi) -; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: uaddo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -281,19 +281,19 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: paddd %xmm0, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, (%rcx) -; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, (%rcx) ; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm4, 
%xmm0 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSSE3-NEXT: paddd %xmm2, %xmm3 ; SSSE3-NEXT: movq %xmm3, 16(%rcx) ; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: pxor %xmm4, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSSE3-NEXT: movq %xmm2, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: uaddo_v6i32: diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index b919bdb40c5f1..e929499c92cbd 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1621,85 +1621,85 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-LABEL: umulo_v64i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pxor %xmm10, %xmm10 ; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm8, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSE2-NEXT: pmullw %xmm8, %xmm11 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pand %xmm8, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packuswb %xmm11, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm12 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE2-NEXT: movdqa %xmm1, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm12, %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pand %xmm8, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-NEXT: packuswb %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSE2-NEXT: movdqa %xmm1, %xmm12 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; SSE2-NEXT: pmullw %xmm9, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm5, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: packuswb %xmm12, %xmm5 -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSE2-NEXT: movdqa %xmm2, %xmm12 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm13, %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm13 -; SSE2-NEXT: pand %xmm8, %xmm13 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE2-NEXT: packuswb %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSE2-NEXT: movdqa %xmm2, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; SSE2-NEXT: pmullw %xmm9, %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm14 +; SSE2-NEXT: pand %xmm8, %xmm14 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: movdqa %xmm7, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: packuswb %xmm14, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: 
punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] ; SSE2-NEXT: movdqa %xmm3, %xmm14 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSE2-NEXT: pmullw %xmm13, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] +; SSE2-NEXT: pmullw %xmm6, %xmm14 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE2-NEXT: pmullw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: movdqa %xmm14, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm7, %xmm8 +; SSE2-NEXT: packuswb %xmm6, %xmm8 ; SSE2-NEXT: psrlw $8, %xmm14 ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: packuswb %xmm14, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm12 +; SSE2-NEXT: psrlw $8, %xmm13 ; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm12, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm11 +; SSE2-NEXT: packuswb %xmm13, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm12 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm11, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm10 +; SSE2-NEXT: packuswb %xmm12, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm11 ; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm10, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm9, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm10, %xmm3 +; SSE2-NEXT: pcmpeqb %xmm10, %xmm2 +; SSE2-NEXT: pcmpeqb %xmm10, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm10, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: movdqa %xmm8, 48(%rsi) -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm6, 32(%rsi) -; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm9, 32(%rsi) +; SSE2-NEXT: movdqa %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm5, 16(%rsi) ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: movdqa %xmm4, (%rsi) @@ -1745,11 +1745,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 176(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = 
xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: movdqa %xmm6, 144(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm8 +; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: movdqa %xmm8, 144(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 @@ -1760,11 +1760,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 112(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, 80(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: movdqa %xmm7, 80(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm1 @@ -1774,95 +1774,95 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, 48(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: movdqa %xmm7, 16(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: movdqa %xmm6, 16(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v64i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: movdqa %xmm4, %xmm8 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; SSSE3-NEXT: movdqa %xmm0, %xmm10 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm8, %xmm10 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSSE3-NEXT: movdqa %xmm0, %xmm11 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSSE3-NEXT: pmullw %xmm8, %xmm11 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: movdqa %xmm10, %xmm11 -; SSSE3-NEXT: pand %xmm8, %xmm11 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} 
xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSSE3-NEXT: movdqa %xmm11, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: packuswb %xmm11, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm12 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSSE3-NEXT: movdqa %xmm1, %xmm11 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm12, %xmm11 -; SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSSE3-NEXT: pand %xmm8, %xmm12 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSSE3-NEXT: packuswb %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSSE3-NEXT: movdqa %xmm1, %xmm12 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; SSSE3-NEXT: pmullw %xmm9, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm5, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: packuswb %xmm12, %xmm5 -; SSSE3-NEXT: movdqa %xmm6, %xmm13 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSSE3-NEXT: movdqa %xmm2, %xmm12 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm13, %xmm12 -; SSSE3-NEXT: movdqa %xmm12, %xmm13 -; SSSE3-NEXT: pand %xmm8, %xmm13 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSSE3-NEXT: packuswb %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; SSSE3-NEXT: movdqa %xmm2, %xmm13 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; SSSE3-NEXT: pmullw %xmm9, %xmm13 +; SSSE3-NEXT: movdqa %xmm13, %xmm14 +; SSSE3-NEXT: pand %xmm8, %xmm14 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: packuswb %xmm13, %xmm6 -; SSSE3-NEXT: movdqa %xmm7, %xmm13 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSSE3-NEXT: movdqa %xmm2, %xmm9 +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: packuswb %xmm14, %xmm9 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] ; SSSE3-NEXT: movdqa %xmm3, %xmm14 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSSE3-NEXT: pmullw %xmm13, %xmm14 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] +; SSSE3-NEXT: pmullw %xmm6, %xmm14 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm7, %xmm3 -; SSSE3-NEXT: movdqa %xmm14, %xmm7 -; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: movdqa %xmm14, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm6 ; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: packuswb %xmm7, %xmm8 +; SSSE3-NEXT: packuswb %xmm6, %xmm8 ; SSSE3-NEXT: psrlw $8, %xmm14 ; SSSE3-NEXT: psrlw $8, %xmm3 ; SSSE3-NEXT: packuswb %xmm14, %xmm3 -; SSSE3-NEXT: psrlw $8, %xmm12 +; SSSE3-NEXT: psrlw $8, %xmm13 ; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: packuswb %xmm12, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm11 +; SSSE3-NEXT: packuswb %xmm13, %xmm2 +; 
SSSE3-NEXT: psrlw $8, %xmm12 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: packuswb %xmm11, %xmm1 -; SSSE3-NEXT: psrlw $8, %xmm10 +; SSSE3-NEXT: packuswb %xmm12, %xmm1 +; SSSE3-NEXT: psrlw $8, %xmm11 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: packuswb %xmm10, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm9, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 -; SSSE3-NEXT: pxor %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm1 -; SSSE3-NEXT: pxor %xmm7, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: packuswb %xmm11, %xmm0 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm3 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm6 ; SSSE3-NEXT: movdqa %xmm8, 48(%rsi) -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: movdqa %xmm6, 32(%rsi) -; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm9, 32(%rsi) +; SSSE3-NEXT: movdqa %xmm2, %xmm8 ; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: movdqa %xmm4, (%rsi) @@ -1908,11 +1908,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 176(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, 144(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm8 +; SSSE3-NEXT: psrad $31, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, 144(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm2 @@ -1923,11 +1923,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 112(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm8 -; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, 80(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm7 +; SSSE3-NEXT: psrad $31, %xmm7 +; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm1 @@ -1937,11 +1937,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm7 -; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: movdqa 
%xmm7, 16(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, 16(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v64i8: @@ -2094,32 +2094,32 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm8 -; AVX1-NEXT: vpackuswb %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm4 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm9 +; AVX1-NEXT: vpackuswb %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX1-NEXT: vpmullw %xmm8, %xmm10, %xmm8 -; AVX1-NEXT: vpand %xmm9, %xmm8, %xmm11 +; AVX1-NEXT: vpmullw %xmm9, %xmm10, %xmm9 +; AVX1-NEXT: vpand %xmm7, %xmm9, %xmm11 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm10 -; AVX1-NEXT: vpand %xmm9, %xmm10, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm0 ; AVX1-NEXT: vpackuswb %xmm11, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; AVX1-NEXT: vpmullw %xmm2, %xmm11, %xmm11 -; AVX1-NEXT: vpand 
%xmm9, %xmm11, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm11, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm12, %xmm13, %xmm12 -; AVX1-NEXT: vpand %xmm9, %xmm12, %xmm13 +; AVX1-NEXT: vpand %xmm7, %xmm12, %xmm13 ; AVX1-NEXT: vpackuswb %xmm2, %xmm13, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] @@ -2128,77 +2128,77 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpmullw %xmm13, %xmm14, %xmm13 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm9, %xmm13, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm9, %xmm9 -; AVX1-NEXT: vpackuswb %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm9 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm9 +; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm13, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm7 +; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm11 -; AVX1-NEXT: vpackuswb %xmm9, %xmm11, %xmm9 -; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 +; AVX1-NEXT: vpackuswb %xmm3, %xmm11, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm9 ; AVX1-NEXT: vpsrlw $8, %xmm10, %xmm10 -; AVX1-NEXT: vpackuswb %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpackuswb %xmm9, %xmm10, %xmm9 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 -; AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 +; AVX1-NEXT: vpackuswb %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm9 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm10 -; AVX1-NEXT: vpcmpeqd %xmm11, %xmm11, %xmm11 -; AVX1-NEXT: vpxor %xmm3, %xmm11, %xmm7 -; AVX1-NEXT: vpxor %xmm11, %xmm9, %xmm6 -; AVX1-NEXT: vpxor %xmm11, %xmm8, %xmm5 -; AVX1-NEXT: vpxor %xmm11, %xmm10, %xmm3 -; AVX1-NEXT: vmovdqa %xmm1, 48(%rsi) +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm8 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm9 +; AVX1-NEXT: vpcmpeqd %xmm10, %xmm10, %xmm10 +; AVX1-NEXT: vpxor %xmm1, %xmm10, %xmm6 +; AVX1-NEXT: vpxor %xmm3, %xmm10, %xmm5 +; AVX1-NEXT: vpxor %xmm10, %xmm8, %xmm3 +; AVX1-NEXT: vpxor %xmm10, %xmm9, %xmm1 +; AVX1-NEXT: vmovdqa %xmm7, 48(%rsi) ; AVX1-NEXT: vmovdqa %xmm2, 32(%rsi) ; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) ; AVX1-NEXT: vmovdqa %xmm4, (%rsi) -; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 128(%rdi) +; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) +; AVX1-NEXT: 
vmovdqa %xmm0, 128(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 112(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 80(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) ; AVX1-NEXT: vzeroupper @@ -2907,14 +2907,14 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: pushq %r14 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %r9, %r10 -; SSE2-NEXT: movq %rcx, %r11 +; SSE2-NEXT: movq %r9, %r11 +; SSE2-NEXT: movq %rcx, %r10 ; SSE2-NEXT: movq %rdx, %rcx ; SSE2-NEXT: movq %rsi, %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSE2-NEXT: testq %r10, %r10 +; SSE2-NEXT: testq %r11, %r11 ; SSE2-NEXT: setne %dl ; SSE2-NEXT: testq %rsi, %rsi ; SSE2-NEXT: setne %bpl @@ -2922,32 +2922,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: seto %r15b -; SSE2-NEXT: movq %r10, %rax +; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %rdi ; SSE2-NEXT: seto %r12b ; SSE2-NEXT: orb %r15b, %r12b ; SSE2-NEXT: orb %bpl, %r12b -; SSE2-NEXT: leaq (%rsi,%rax), %r10 +; SSE2-NEXT: leaq (%rsi,%rax), %r11 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rax, %rdi ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: addq %r10, %rsi -; SSE2-NEXT: setb %r10b -; SSE2-NEXT: orb %r12b, %r10b 
+; SSE2-NEXT: addq %r11, %rsi +; SSE2-NEXT: setb %r11b +; SSE2-NEXT: orb %r12b, %r11b ; SSE2-NEXT: testq %r9, %r9 ; SSE2-NEXT: setne %al -; SSE2-NEXT: testq %r11, %r11 +; SSE2-NEXT: testq %r10, %r10 ; SSE2-NEXT: setne %bpl ; SSE2-NEXT: andb %al, %bpl -; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r14 ; SSE2-NEXT: movq %rax, %r8 -; SSE2-NEXT: seto %r11b +; SSE2-NEXT: seto %r10b ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: mulq %rcx ; SSE2-NEXT: seto %r9b -; SSE2-NEXT: orb %r11b, %r9b +; SSE2-NEXT: orb %r10b, %r9b ; SSE2-NEXT: orb %bpl, %r9b ; SSE2-NEXT: addq %rax, %r8 ; SSE2-NEXT: movq %rcx, %rax @@ -2958,7 +2958,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: movzbl %r10b, %ecx +; SSE2-NEXT: movzbl %r11b, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2980,14 +2980,14 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: pushq %r14 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq %r9, %r10 -; SSSE3-NEXT: movq %rcx, %r11 +; SSSE3-NEXT: movq %r9, %r11 +; SSSE3-NEXT: movq %rcx, %r10 ; SSSE3-NEXT: movq %rdx, %rcx ; SSSE3-NEXT: movq %rsi, %rax ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSSE3-NEXT: testq %r10, %r10 +; SSSE3-NEXT: testq %r11, %r11 ; SSSE3-NEXT: setne %dl ; SSSE3-NEXT: testq %rsi, %rsi ; SSSE3-NEXT: setne %bpl @@ -2995,32 +2995,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rsi ; SSSE3-NEXT: seto %r15b -; SSSE3-NEXT: movq %r10, %rax +; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %rdi ; SSSE3-NEXT: seto %r12b ; SSSE3-NEXT: orb %r15b, %r12b ; SSSE3-NEXT: orb %bpl, %r12b -; SSSE3-NEXT: leaq (%rsi,%rax), %r10 +; SSSE3-NEXT: leaq (%rsi,%rax), %r11 ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rax, %rdi ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: addq %r10, %rsi -; SSSE3-NEXT: setb %r10b -; SSSE3-NEXT: orb %r12b, %r10b +; SSSE3-NEXT: addq %r11, %rsi +; SSSE3-NEXT: setb %r11b +; SSSE3-NEXT: orb %r12b, %r11b ; SSSE3-NEXT: testq %r9, %r9 ; SSSE3-NEXT: setne %al -; SSSE3-NEXT: testq %r11, %r11 +; SSSE3-NEXT: testq %r10, %r10 ; SSSE3-NEXT: setne %bpl ; SSSE3-NEXT: andb %al, %bpl -; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r14 ; SSSE3-NEXT: movq %rax, %r8 -; SSSE3-NEXT: seto %r11b +; SSSE3-NEXT: seto %r10b ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: mulq %rcx ; SSSE3-NEXT: seto %r9b -; SSSE3-NEXT: orb %r11b, %r9b +; SSSE3-NEXT: orb %r10b, %r9b ; SSSE3-NEXT: orb %bpl, %r9b ; SSSE3-NEXT: addq %rax, %r8 ; SSSE3-NEXT: movq %rcx, %rax @@ -3031,7 +3031,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: movzbl %r10b, %ecx +; SSSE3-NEXT: movzbl %r11b, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -3053,14 +3053,14 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: pushq %r14 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %r9, %r10 -; SSE41-NEXT: movq %rcx, %r11 +; 
SSE41-NEXT: movq %r9, %r11 +; SSE41-NEXT: movq %rcx, %r10 ; SSE41-NEXT: movq %rdx, %rcx ; SSE41-NEXT: movq %rsi, %rax ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; SSE41-NEXT: testq %r10, %r10 +; SSE41-NEXT: testq %r11, %r11 ; SSE41-NEXT: setne %dl ; SSE41-NEXT: testq %rsi, %rsi ; SSE41-NEXT: setne %bpl @@ -3068,32 +3068,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rsi ; SSE41-NEXT: seto %r15b -; SSE41-NEXT: movq %r10, %rax +; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %rdi ; SSE41-NEXT: seto %r12b ; SSE41-NEXT: orb %r15b, %r12b ; SSE41-NEXT: orb %bpl, %r12b -; SSE41-NEXT: leaq (%rsi,%rax), %r10 +; SSE41-NEXT: leaq (%rsi,%rax), %r11 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rax, %rdi ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: addq %r10, %rsi -; SSE41-NEXT: setb %r10b -; SSE41-NEXT: orb %r12b, %r10b +; SSE41-NEXT: addq %r11, %rsi +; SSE41-NEXT: setb %r11b +; SSE41-NEXT: orb %r12b, %r11b ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %al -; SSE41-NEXT: testq %r11, %r11 +; SSE41-NEXT: testq %r10, %r10 ; SSE41-NEXT: setne %bpl ; SSE41-NEXT: andb %al, %bpl -; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r14 ; SSE41-NEXT: movq %rax, %r8 -; SSE41-NEXT: seto %r11b +; SSE41-NEXT: seto %r10b ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: mulq %rcx ; SSE41-NEXT: seto %r9b -; SSE41-NEXT: orb %r11b, %r9b +; SSE41-NEXT: orb %r10b, %r9b ; SSE41-NEXT: orb %bpl, %r9b ; SSE41-NEXT: addq %rax, %r8 ; SSE41-NEXT: movq %rcx, %rax @@ -3103,7 +3103,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; SSE41-NEXT: orb %r9b, %cl ; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: negl %ecx -; SSE41-NEXT: movzbl %r10b, %r8d +; SSE41-NEXT: movzbl %r11b, %r8d ; SSE41-NEXT: negl %r8d ; SSE41-NEXT: movd %r8d, %xmm0 ; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 @@ -3125,14 +3125,14 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: pushq %r14 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %r9, %r10 -; AVX-NEXT: movq %rcx, %r11 +; AVX-NEXT: movq %r9, %r11 +; AVX-NEXT: movq %rcx, %r10 ; AVX-NEXT: movq %rdx, %rcx ; AVX-NEXT: movq %rsi, %rax ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; AVX-NEXT: testq %r10, %r10 +; AVX-NEXT: testq %r11, %r11 ; AVX-NEXT: setne %dl ; AVX-NEXT: testq %rsi, %rsi ; AVX-NEXT: setne %bpl @@ -3140,32 +3140,32 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rsi ; AVX-NEXT: seto %r15b -; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %rdi ; AVX-NEXT: seto %r12b ; AVX-NEXT: orb %r15b, %r12b ; AVX-NEXT: orb %bpl, %r12b -; AVX-NEXT: leaq (%rsi,%rax), %r10 +; AVX-NEXT: leaq (%rsi,%rax), %r11 ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rax, %rdi ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: addq %r10, %rsi -; AVX-NEXT: setb %r10b -; AVX-NEXT: orb %r12b, %r10b +; AVX-NEXT: addq %r11, %rsi +; AVX-NEXT: setb %r11b +; AVX-NEXT: orb %r12b, %r11b ; AVX-NEXT: testq %r9, %r9 ; AVX-NEXT: setne %al -; AVX-NEXT: testq %r11, %r11 +; AVX-NEXT: testq %r10, %r10 ; AVX-NEXT: setne %bpl ; AVX-NEXT: andb %al, %bpl -; AVX-NEXT: movq %r11, %rax +; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r14 ; AVX-NEXT: movq 
%rax, %r8 -; AVX-NEXT: seto %r11b +; AVX-NEXT: seto %r10b ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: mulq %rcx ; AVX-NEXT: seto %r9b -; AVX-NEXT: orb %r11b, %r9b +; AVX-NEXT: orb %r10b, %r9b ; AVX-NEXT: orb %bpl, %r9b ; AVX-NEXT: addq %rax, %r8 ; AVX-NEXT: movq %rcx, %rax @@ -3175,7 +3175,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind ; AVX-NEXT: orb %r9b, %cl ; AVX-NEXT: movzbl %cl, %ecx ; AVX-NEXT: negl %ecx -; AVX-NEXT: movzbl %r10b, %r8d +; AVX-NEXT: movzbl %r11b, %r8d ; AVX-NEXT: negl %r8d ; AVX-NEXT: vmovd %r8d, %xmm0 ; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index df5da63b50359..a58c3dd0d5307 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -240,35 +240,35 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE2-NEXT: movd %r8d, %xmm0 ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movd %edx, %xmm2 ; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: movd %r9d, %xmm1 ; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: psubd %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psubd %xmm3, %xmm0 +; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: movq %xmm0, 16(%rcx) -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-NEXT: movq %xmm0, 16(%rdi) ; SSE2-NEXT: movdqa %xmm4, (%rdi) @@ -281,35 +281,35 @@ define <6 x i32> @usubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 
= mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSSE3-NEXT: movd %r8d, %xmm0 ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movd %edx, %xmm2 ; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSSE3-NEXT: movd %r9d, %xmm1 ; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: psubd %xmm2, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: psubd %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm4, (%rcx) -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: psubd %xmm3, %xmm0 +; SSSE3-NEXT: psubd %xmm2, %xmm0 ; SSSE3-NEXT: movq %xmm0, 16(%rcx) -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSSE3-NEXT: movq %xmm0, 16(%rdi) ; SSSE3-NEXT: movdqa %xmm4, (%rdi) diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll index 890514fbdc022..6d71564dd57f9 100644 --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -1404,38 +1404,38 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; ; SSE42-LABEL: fadd_v8f64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movapd %xmm0, %xmm8 +; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128] -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: pand %xmm9, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm9, %xmm0 -; SSE42-NEXT: movapd {{.*#+}} xmm9 = [-0.0E+0,-0.0E+0] -; SSE42-NEXT: movapd %xmm9, %xmm11 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: pand %xmm10, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 +; SSE42-NEXT: movapd {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0] +; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm7 +; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: 
blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm6 +; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm10 -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE42-NEXT: addpd %xmm8, %xmm9 +; SSE42-NEXT: pand %xmm0, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE42-NEXT: addpd %xmm9, %xmm10 ; SSE42-NEXT: addpd %xmm6, %xmm1 ; SSE42-NEXT: addpd %xmm7, %xmm2 ; SSE42-NEXT: addpd %xmm11, %xmm3 -; SSE42-NEXT: movapd %xmm9, %xmm0 +; SSE42-NEXT: movapd %xmm10, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fadd_v8f64_cast_cond: @@ -1734,38 +1734,38 @@ define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; ; SSE42-LABEL: fmul_v8f64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movapd %xmm0, %xmm8 +; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128] -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: pand %xmm9, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm9, %xmm0 -; SSE42-NEXT: movapd {{.*#+}} xmm9 = [1.0E+0,1.0E+0] -; SSE42-NEXT: movapd %xmm9, %xmm11 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: pand %xmm10, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 +; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1.0E+0,1.0E+0] +; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm7 +; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm6 +; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm10 -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE42-NEXT: mulpd %xmm8, %xmm9 +; SSE42-NEXT: pand %xmm0, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE42-NEXT: mulpd %xmm9, %xmm10 ; SSE42-NEXT: mulpd %xmm6, %xmm1 ; SSE42-NEXT: mulpd %xmm7, %xmm2 ; SSE42-NEXT: mulpd %xmm11, %xmm3 -; SSE42-NEXT: movapd %xmm9, %xmm0 +; SSE42-NEXT: movapd %xmm10, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fmul_v8f64_cast_cond: @@ -1922,38 +1922,38 @@ define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n ; ; SSE42-LABEL: fdiv_v8f64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movapd %xmm0, %xmm8 +; SSE42-NEXT: movapd %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; 
SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm10, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 ; SSE42-NEXT: movapd {{.*#+}} xmm11 = [1.0E+0,1.0E+0] ; SSE42-NEXT: movapd %xmm11, %xmm10 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm10 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 ; SSE42-NEXT: movapd %xmm11, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 ; SSE42-NEXT: movapd %xmm11, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm9 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm9 -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: pand %xmm0, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm11 -; SSE42-NEXT: divpd %xmm11, %xmm8 +; SSE42-NEXT: divpd %xmm11, %xmm9 ; SSE42-NEXT: divpd %xmm6, %xmm1 ; SSE42-NEXT: divpd %xmm7, %xmm2 ; SSE42-NEXT: divpd %xmm10, %xmm3 -; SSE42-NEXT: movapd %xmm8, %xmm0 +; SSE42-NEXT: movapd %xmm9, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: fdiv_v8f64_cast_cond: @@ -2982,43 +2982,43 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; ; SSE42-LABEL: mul_v8i64_cast_cond: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa %xmm0, %xmm8 +; SSE42-NEXT: movdqa %xmm0, %xmm9 ; SSE42-NEXT: movd %edi, %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] -; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128] -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: pand %xmm9, %xmm0 -; SSE42-NEXT: pcmpeqq %xmm9, %xmm0 -; SSE42-NEXT: movapd {{.*#+}} xmm9 = [1,1] -; SSE42-NEXT: movapd %xmm9, %xmm11 +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] +; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128] +; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: pand %xmm10, %xmm0 +; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 +; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1,1] +; SSE42-NEXT: movapd %xmm10, %xmm11 ; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 ; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm7, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm7 +; SSE42-NEXT: movapd %xmm10, %xmm7 ; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8] -; SSE42-NEXT: movdqa %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm8, %xmm0 ; SSE42-NEXT: pand %xmm6, %xmm0 ; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 -; SSE42-NEXT: movapd %xmm9, %xmm6 +; SSE42-NEXT: movapd %xmm10, %xmm6 ; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 ; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2] -; SSE42-NEXT: pand %xmm0, %xmm10 -; SSE42-NEXT: pcmpeqq %xmm0, %xmm10 -; SSE42-NEXT: movdqa %xmm10, %xmm0 -; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm9 +; SSE42-NEXT: pand %xmm0, %xmm8 +; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm8, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE42-NEXT: movdqa %xmm9, %xmm0 ; SSE42-NEXT: psrlq $32, %xmm0 -; SSE42-NEXT: pmuludq %xmm9, %xmm0 -; SSE42-NEXT: movdqa %xmm9, %xmm4 +; SSE42-NEXT: pmuludq %xmm10, %xmm0 +; SSE42-NEXT: movdqa %xmm10, %xmm4 ; SSE42-NEXT: psrlq $32, %xmm4 -; SSE42-NEXT: pmuludq %xmm8, %xmm4 +; SSE42-NEXT: pmuludq %xmm9, %xmm4 ; SSE42-NEXT: paddq %xmm0, %xmm4 ; SSE42-NEXT: psllq $32, %xmm4 -; SSE42-NEXT: 
pmuludq %xmm8, %xmm9 -; SSE42-NEXT: paddq %xmm4, %xmm9 +; SSE42-NEXT: pmuludq %xmm9, %xmm10 +; SSE42-NEXT: paddq %xmm4, %xmm10 ; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: psrlq $32, %xmm0 ; SSE42-NEXT: pmuludq %xmm6, %xmm0 @@ -3049,7 +3049,7 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef ; SSE42-NEXT: psllq $32, %xmm4 ; SSE42-NEXT: pmuludq %xmm11, %xmm3 ; SSE42-NEXT: paddq %xmm4, %xmm3 -; SSE42-NEXT: movdqa %xmm9, %xmm0 +; SSE42-NEXT: movdqa %xmm10, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: mul_v8i64_cast_cond: diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index 2c7f65064a5e6..f4d6b52377f57 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -1742,35 +1742,35 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind ; SSE-NEXT: packssdw %xmm4, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: cmpltpd %xmm1, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: cmpltpd %xmm0, %xmm1 -; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: cmpltpd %xmm0, %xmm4 +; SSE-NEXT: packssdw %xmm2, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: packssdw %xmm3, %xmm1 +; SSE-NEXT: packssdw %xmm3, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: packsswb %xmm5, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: packsswb %xmm5, %xmm1 +; SSE-NEXT: pmovmskb %xmm4, %ecx ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pmovmskb %xmm1, %ecx -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: packssdw %xmm1, %xmm3 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: packssdw %xmm1, %xmm2 -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: packssdw %xmm3, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: packssdw %xmm2, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: packssdw %xmm2, %xmm0 ; SSE-NEXT: packssdw %xmm3, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %edx ; SSE-NEXT: shll $16, %edx ; SSE-NEXT: orl %ecx, %edx diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 818480854c9df..ed6f5007b77e3 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -54,12 +54,12 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind { ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 
16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -105,12 +105,12 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind { ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -480,12 +480,12 @@ define <4 x float> @cvt_4i16_to_4f32_constrained(<4 x i16> %a0) nounwind strictf ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -895,12 +895,12 @@ define <4 x float> @load_cvt_8i16_to_4f32(ptr %a0) nounwind { ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1318,12 +1318,12 @@ define <4 x float> 
@load_cvt_8i16_to_4f32_constrained(ptr %a0) nounwind strictfp ; AVX-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index 28241da6506a9..fcc68f09c1d20 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -503,56 +503,55 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movdqa 96(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pslld $16, %xmm12 +; SSE-NEXT: psrad $16, %xmm12 +; SSE-NEXT: packssdw %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm10 ; SSE-NEXT: psrad $16, %xmm10 ; SSE-NEXT: packssdw %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, 
%xmm0 ; SSE-NEXT: pslld $16, %xmm9 ; SSE-NEXT: psrad $16, %xmm9 ; SSE-NEXT: packssdw %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm11 ; SSE-NEXT: psrad $16, %xmm11 ; SSE-NEXT: packssdw %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm13 ; SSE-NEXT: psrad $16, %xmm13 ; SSE-NEXT: packssdw %xmm0, %xmm13 -; SSE-NEXT: movdqa 240(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm7 @@ -597,9 +596,9 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: psrad $16, %xmm3 ; SSE-NEXT: psrad $16, %xmm5 ; SSE-NEXT: packssdw %xmm3, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm6 ; SSE-NEXT: packssdw %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -607,9 +606,10 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm3 ; SSE-NEXT: packssdw %xmm0, %xmm3 -; SSE-NEXT: psrad $16, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm7 -; SSE-NEXT: packssdw %xmm12, %xmm7 +; SSE-NEXT: packssdw %xmm0, %xmm7 ; SSE-NEXT: psrad $16, %xmm8 ; SSE-NEXT: psrad $16, %xmm2 ; SSE-NEXT: packssdw %xmm8, %xmm2 @@ -620,8 +620,7 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movdqa %xmm11, 64(%rsi) ; SSE-NEXT: movdqa %xmm9, (%rsi) ; SSE-NEXT: movdqa %xmm10, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm12, 16(%rsi) ; SSE-NEXT: movdqa %xmm2, 96(%rdx) ; SSE-NEXT: movdqa %xmm7, 112(%rdx) ; SSE-NEXT: movdqa %xmm3, 64(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 1967248590bc1..2fdc04bd73455 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -454,14 +454,14 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm5, %xmm10 ; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 ; SSE-NEXT: por %xmm8, 
%xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] @@ -473,82 +473,82 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] ; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0] -; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,0] +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm12, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: 
pand %xmm8, %xmm12 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pandn %xmm13, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm4 ; SSE-NEXT: por %xmm10, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,0] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm6, 16(%rsi) ; SSE-NEXT: movaps %xmm7, (%rsi) -; SSE-NEXT: movdqa %xmm13, 16(%rdx) -; SSE-NEXT: movdqa %xmm12, (%rdx) -; SSE-NEXT: movdqa %xmm9, 16(%rcx) -; SSE-NEXT: movdqa %xmm8, (%rcx) +; SSE-NEXT: movdqa %xmm11, 16(%rdx) +; SSE-NEXT: movdqa %xmm9, (%rdx) +; SSE-NEXT: movdqa %xmm8, 16(%rcx) +; SSE-NEXT: movdqa %xmm10, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf16: @@ -709,23 +709,23 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i16_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movdqa 96(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 144(%rdi), %xmm13 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm15 ; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm14 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa 64(%rdi), %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] ; 
SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -733,16 +733,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] +; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] @@ -750,18 +751,20 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] @@ -771,17 +774,16 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: movdqa 128(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] @@ -790,36 +792,56 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm14, %xmm6 ; SSE-NEXT: pandn %xmm13, %xmm6 ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm14[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa %xmm0, %xmm13 @@ -831,116 +853,95 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: 
por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm10[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[0,1,0,2] +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,4,7,6,7] +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm2 # 
16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm9, 48(%rdx) -; SSE-NEXT: movdqa %xmm13, (%rdx) -; SSE-NEXT: movdqa %xmm7, 16(%rdx) +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movdqa %xmm5, 32(%rdx) +; SSE-NEXT: movdqa %xmm13, 48(%rdx) +; SSE-NEXT: movdqa %xmm15, (%rdx) +; SSE-NEXT: movdqa %xmm10, 16(%rdx) ; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm5, 48(%rcx) +; SSE-NEXT: movdqa %xmm2, 48(%rcx) ; SSE-NEXT: movdqa %xmm3, (%rcx) ; SSE-NEXT: movdqa %xmm4, 16(%rcx) -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf32: @@ -952,24 +953,24 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5,6],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = 
xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,3,2,3,4,5,6,7] @@ -979,10 +980,10 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] @@ -990,59 +991,59 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm8[2],xmm5[3,4],xmm8[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, 
%xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, (%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX1-ONLY-NEXT: vzeroupper @@ -1246,909 +1247,905 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr 
%out.vec2) nounwind { ; SSE-LABEL: load_i16_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movdqa 192(%rdi), %xmm11 +; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa 272(%rdi), %xmm6 +; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: movdqa 256(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,7,6,7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,2,1] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] +; SSE-NEXT: 
shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] +; SSE-NEXT: movdqa 208(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 224(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm1[2,0] -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] +; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa 320(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa 144(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 368(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 128(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 320(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: pandn %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} 
xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm14[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,5,6] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; 
SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pandn %xmm5, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = 
xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm5, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pandn (%rsp), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,2] -; SSE-NEXT: pandn %xmm13, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rsi) -; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,2] +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movdqa %xmm3, 96(%rdx) -; SSE-NEXT: movdqa %xmm8, 32(%rdx) +; SSE-NEXT: movdqa %xmm6, 32(%rdx) ; SSE-NEXT: movdqa %xmm10, 112(%rdx) -; SSE-NEXT: movdqa %xmm14, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movdqa %xmm6, 96(%rcx) -; SSE-NEXT: movdqa %xmm11, 112(%rcx) -; SSE-NEXT: movdqa %xmm0, 64(%rcx) -; SSE-NEXT: movdqa %xmm15, 80(%rcx) -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm4, 48(%rcx) -; SSE-NEXT: movdqa %xmm9, (%rcx) -; SSE-NEXT: movdqa %xmm7, 16(%rcx) -; SSE-NEXT: addq $408, %rsp # imm = 0x198 +; SSE-NEXT: movdqa %xmm15, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm13, 96(%rcx) +; SSE-NEXT: movdqa %xmm9, 112(%rcx) +; SSE-NEXT: movdqa %xmm11, 64(%rcx) +; SSE-NEXT: movdqa %xmm14, 80(%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm5, 48(%rcx) +; SSE-NEXT: movdqa %xmm8, (%rcx) +; SSE-NEXT: movdqa %xmm4, 16(%rcx) +; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2],xmm0[3,4],xmm10[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm0[2],xmm15[3,4],xmm0[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 +; AVX1-ONLY-NEXT: 
vmovdqa 64(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm15[2],xmm11[3,4],xmm15[5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm5[1],xmm8[2,3],xmm5[4],xmm8[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), 
%xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1],xmm12[2,3],xmm3[4],xmm12[5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm10[0,1],mem[2],xmm10[3,4],mem[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm15[2],mem[3,4],xmm15[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm13[2],xmm8[3,4],xmm13[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm14[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[0,1],mem[2],xmm8[3,4],mem[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2],xmm0[3,4],mem[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm13[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] +; 
AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm1[0,1],mem[2],xmm1[3,4],mem[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm5[0,1],mem[2],xmm5[3,4],mem[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1],xmm8[2],xmm12[3,4],xmm8[5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm3[2],xmm9[3,4],xmm3[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm13[2],xmm15[3,4],xmm13[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2],xmm0[3,4],mem[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm8[2],mem[3,4],xmm8[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm9[2],mem[3,4],xmm9[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm12[2],xmm8[3,4],xmm12[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm15[1],xmm13[2,3],xmm15[4],xmm13[5,6],xmm15[7] -; 
AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm3[1],mem[2,3],xmm3[4],mem[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd $230, (%rsp), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[0],mem[1],xmm3[2,3],mem[4],xmm3[5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3],xmm6[4],mem[5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rdx) +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3],mem[4],xmm7[5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm8[1],mem[2,3],xmm8[4],mem[5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 80(%rcx) +; 
AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%rcx) -; AVX1-ONLY-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride3_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $168, %rsp +; AVX2-ONLY-NEXT: subq $136, %rsp ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm0 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7],ymm0[8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13,14],ymm4[15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm15 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm3, %ymm5, %ymm10 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm12, %ymm11, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm12, %ymm11, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm9, %ymm7, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm9, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm5, %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm11 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] +; 
AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11 +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2,3],ymm0[4],ymm15[5,6],ymm0[7],ymm15[8],ymm0[9],ymm15[10,11],ymm0[12],ymm15[13,14],ymm0[15] -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm2 -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm15[2],xmm6[3,4],xmm15[5],xmm6[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = 
ymm10[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3],ymm0[4],ymm10[5,6],ymm0[7],ymm10[8],ymm0[9],ymm10[10,11],ymm0[12],ymm10[13,14],ymm0[15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm10[2],xmm3[3,4],xmm10[5],xmm3[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6],ymm0[7],ymm11[8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14],ymm0[15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm1, %ymm12 ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7,8,9],ymm4[10],ymm11[11,12],ymm4[13],ymm11[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1],xmm6[2],xmm15[3,4],xmm6[5],xmm15[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm11[3,4,5,6,7],ymm4[8,9,10],ymm11[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7,8,9],ymm11[10],ymm9[11,12],ymm11[13],ymm9[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm11[1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7],ymm8[8],ymm11[9,10],ymm8[11],ymm11[12,13],ymm8[14],ymm11[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm14[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7],ymm8[8],ymm14[9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm15[1],xmm6[2,3],xmm15[4],xmm6[5,6],xmm15[7] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm14[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7],ymm8[8],ymm14[9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6],xmm10[7] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7] +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rcx) -; AVX2-ONLY-NEXT: addq $168, %rsp +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2162,17 +2159,17 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm5 ; AVX512F-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] ; AVX512F-NEXT: vmovdqa %xmm2, %xmm14 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm22 ; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm23 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm6 @@ -2189,35 +2186,35 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] ; AVX512F-NEXT: vpshufb %xmm15, %xmm13, %xmm13 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm18 +; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm10 -; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13,14],ymm12[15] -; AVX512F-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] +; AVX512F-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm11 ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512F-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm10[3,4,5,6,7] -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm5 -; AVX512F-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13,14],ymm10[15] -; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm2 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm5[2],xmm10[3,4],xmm5[5],xmm10[6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] +; AVX512F-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 +; AVX512F-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm15 +; 
AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] ; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] @@ -2248,7 +2245,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] ; AVX512F-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm17 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512F-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] @@ -2260,11 +2257,11 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm3 +; AVX512F-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 ; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm10[2],xmm5[3,4],xmm10[5],xmm5[6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] ; AVX512F-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] @@ -2274,12 +2271,12 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512F-NEXT: vpternlogq $226, %ymm16, %ymm0, %ymm15 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm15[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12,13],ymm3[14],ymm15[15] +; AVX512F-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6],xmm5[7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] ; 
AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX512F-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -2315,9 +2312,9 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 9b347e01e92d7..6670de17c6cde 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -549,15 +549,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm8 ; SSE-NEXT: movdqa 16(%rdi), %xmm10 ; SSE-NEXT: movdqa 32(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,0,2,4,5,6,7] @@ -578,7 +578,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,1,1,3,4,5,6,7] @@ -588,30 +588,30 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd 
{{.*#+}} xmm2 = xmm9[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7] @@ -627,13 +627,13 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] ; SSE-NEXT: movapd %xmm4, 16(%rsi) ; SSE-NEXT: movapd %xmm7, (%rsi) -; SSE-NEXT: movapd %xmm8, 16(%rdx) +; SSE-NEXT: movapd %xmm9, 16(%rdx) ; SSE-NEXT: movapd %xmm5, (%rdx) ; SSE-NEXT: movapd %xmm15, 16(%rcx) ; SSE-NEXT: movapd %xmm6, (%rcx) @@ -1189,15 +1189,15 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1212,17 +1212,17 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,0,2,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -1232,8 +1232,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1255,26 +1255,27 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = 
xmm7[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] ; SSE-NEXT: movapd %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm14[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,3,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] @@ -1284,29 +1285,29 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] @@ -1314,26 +1315,25 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm14[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -1343,48 +1343,47 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; 
SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: 
punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1394,204 +1393,211 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movapd %xmm12, 32(%rdx) -; SSE-NEXT: movapd %xmm8, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm15, 32(%rcx) -; SSE-NEXT: movapd %xmm9, (%rcx) -; SSE-NEXT: movapd %xmm13, 48(%rcx) +; SSE-NEXT: movapd %xmm1, 32(%rcx) +; SSE-NEXT: movapd %xmm8, (%rcx) +; SSE-NEXT: movapd %xmm15, 48(%rcx) ; SSE-NEXT: movapd %xmm10, 16(%rcx) -; SSE-NEXT: movapd %xmm2, 32(%r8) +; SSE-NEXT: movapd %xmm3, 32(%r8) ; SSE-NEXT: movapd %xmm7, (%r8) ; SSE-NEXT: movapd %xmm14, 48(%r8) -; SSE-NEXT: movapd %xmm3, 16(%r8) +; SSE-NEXT: movapd %xmm11, 16(%r8) ; SSE-NEXT: addq $248, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $232, %rsp -; AVX1-ONLY-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm5[1,2,3],xmm11[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: subq $280, %rsp # imm = 0x118 +; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm6[1,2,3],xmm4[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm6[1,2,3],xmm11[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 
(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm5[1,2,3],xmm15[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm6[1,2,3],xmm10[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm6[1,2,3],xmm12[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm5[1,2,3],xmm13[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm5[1,2,3],xmm14[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm5[1,2,3],xmm9[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm6[1,2,3],xmm14[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm6[1,2,3],xmm13[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3],xmm8[4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm15, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = 
xmm15[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1599,33 +1605,33 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} 
xmm2 = xmm15[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -1640,18 +1646,18 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rcx) ; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: addq $232, %rsp +; AVX1-ONLY-NEXT: addq $280, %rsp # imm = 0x118 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $184, %rsp +; AVX2-SLOW-NEXT: subq $168, %rsp ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1693,141 +1699,139 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: 
vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 
= mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1835,36 +1839,36 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,3,1,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1874,26 +1878,25 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $184, %rsp +; AVX2-SLOW-NEXT: addq $168, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $136, %rsp +; AVX2-FAST-NEXT: subq $104, %rsp ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -1903,13 +1906,14 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm10 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1920,131 +1924,132 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: 
vpshufb %xmm1, %xmm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm9 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm13 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) 
# 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpshuflw 
{{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-FAST-NEXT: addq $136, %rsp +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: addq $104, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $168, %rsp +; AVX2-FAST-PERLANE-NEXT: subq $184, %rsp ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -2086,185 +2091,187 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, 
%ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm15, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $168, %rsp -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX512F-SLOW-LABEL: load_i16_stride4_vf32: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: addq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512F-SLOW-LABEL: load_i16_stride4_vf32: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vpmovqw %ymm2, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm6 @@ -2544,7 +2551,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -2630,7 +2637,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -2793,643 +2800,645 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; 
SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] 
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw 
$231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm6[0],xmm13[1] +; SSE-NEXT: pshuflw $231, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm5[0],xmm12[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte 
Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm5[0],xmm15[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshuflw $116, (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rsi) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 96(%rcx) -; SSE-NEXT: movapd %xmm7, 32(%rcx) -; SSE-NEXT: movapd %xmm10, 112(%rcx) -; SSE-NEXT: movapd %xmm13, 48(%rcx) +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movapd %xmm2, 96(%rcx) +; SSE-NEXT: movapd %xmm5, 32(%rcx) +; SSE-NEXT: movapd %xmm9, 112(%rcx) +; SSE-NEXT: movapd %xmm10, 48(%rcx) ; SSE-NEXT: movapd %xmm14, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movapd %xmm9, 112(%r8) -; SSE-NEXT: movapd %xmm6, 96(%r8) -; SSE-NEXT: movapd %xmm0, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movapd %xmm7, 112(%r8) +; SSE-NEXT: movapd %xmm4, 96(%r8) +; SSE-NEXT: movapd %xmm3, 80(%r8) ; SSE-NEXT: movapd %xmm15, 64(%r8) ; SSE-NEXT: movapd %xmm12, 48(%r8) -; SSE-NEXT: movapd %xmm1, 32(%r8) -; SSE-NEXT: movapd %xmm2, 16(%r8) -; SSE-NEXT: movapd %xmm4, (%r8) +; SSE-NEXT: movapd %xmm13, 32(%r8) +; SSE-NEXT: movapd %xmm8, 16(%r8) +; SSE-NEXT: movapd %xmm0, (%r8) ; SSE-NEXT: addq $824, %rsp # imm = 0x338 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $824, %rsp # imm = 0x338 -; AVX1-ONLY-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm7[1,2,3],xmm4[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $776, %rsp # imm = 0x308 +; AVX1-ONLY-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm8[0],xmm10[1,2,3],xmm8[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1,2,3],xmm2[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm10[1,2,3],xmm4[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm10[1,2,3],xmm7[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm7[1,2,3],xmm15[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm7[1,2,3],xmm13[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm10[1,2,3],xmm13[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm7[1,2,3],xmm14[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm10[1,2,3],xmm14[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm7[1,2,3],xmm3[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: 
vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm7[1,2,3],xmm5[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm8, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0],xmm7[1,2,3],xmm8[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; 
AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm10[1,2,3],xmm15[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm7[1,2,3],xmm6[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm7[1,2,3],xmm6[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm7[1,2,3],xmm12[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1,2,3],xmm11[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm10[1,2,3],xmm5[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,2,2,3] +; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; 
AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -3470,12 +3479,12 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw $231, (%rsp), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] @@ -3508,8 +3517,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] @@ -3536,7 +3544,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] @@ -3546,8 +3555,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3588,7 +3597,7 @@ define void 
@load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r8) -; AVX1-ONLY-NEXT: addq $824, %rsp # imm = 0x338 +; AVX1-ONLY-NEXT: addq $776, %rsp # imm = 0x308 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3681,7 +3690,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 @@ -3695,7 +3704,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm2 @@ -3739,20 +3748,20 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3792,45 +3801,45 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 
= xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm7 +; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] @@ -3839,36 +3848,36 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} 
xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] @@ -3877,182 +3886,171 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, 
%ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] -; 
AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw 
$116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, 
%ymm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] @@ -4062,47 +4060,58 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r8) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm15, (%r8) ; AVX2-SLOW-NEXT: addq $696, %rsp # imm = 0x2B8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm7 +; 
AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 @@ -4120,11 +4129,10 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4135,235 +4143,232 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; 
AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm8 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm15 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 
432(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm15 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm14 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm10 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 
= xmm2[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd $231, (%rsp), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = mem[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
<2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] @@ -4372,87 +4377,90 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm12 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; 
AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-FAST-PERLANE-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -4534,266 +4542,266 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm3, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: 
vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded 
Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd 
$231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -4801,8 +4809,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4826,11 +4834,11 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4844,376 +4852,369 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: 
vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) -; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $696, %rsp # imm = 0x2B8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, (%r8) +; AVX2-FAST-PERLANE-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride4_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $88, %rsp -; AVX512F-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm26[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq 
{{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm17[0,2,2,3] +; AVX512F-SLOW-NEXT: subq $104, %rsp +; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm27[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm21[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm19[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm20[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm20[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm19 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm19[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm21[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm17 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm24 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 336(%rdi), %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm26 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm29[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm24 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm26[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd 
{{.*#+}} xmm12 = xmm19[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm21[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm16[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm17[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[3,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm29[3,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm18[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm27[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm17[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm19[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] 
-; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm20[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpmovqw %ymm1, %xmm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm13[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpmovqw %ymm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512F-SLOW-NEXT: vpmovqw %ymm14, %xmm14 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 -; AVX512F-SLOW-NEXT: vpmovqw %zmm21, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm12[0,1,2,3],zmm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm13 -; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm2, %zmm13 -; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm21, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm12 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} 
xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm29 +; AVX512F-SLOW-NEXT: vpmovqw %zmm29, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpmovqw %ymm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm25 +; AVX512F-SLOW-NEXT: vpmovqw %zmm25, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512F-SLOW-NEXT: vpmovqw %zmm18, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm29, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm25, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm18, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm29, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm25, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm2, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = 
zmm5[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm14 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm18, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm8[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm29, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm25, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm18, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm4, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512F-SLOW-NEXT: addq $88, %rsp +; AVX512F-SLOW-NEXT: addq $104, %rsp ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 8bda8ab81eac6..30ee74b335752 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -489,35 +489,35 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr 
%out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,3] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,0,1] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm8, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] @@ -527,64 +527,65 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm7, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm7 ; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[0,1,1,3] -; SSE-NEXT: psllq $48, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = 
xmm12[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; SSE-NEXT: movdqa %xmm11, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,1,1,3] +; SSE-NEXT: psllq $48, %xmm6 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pandn %xmm6, %xmm13 -; SSE-NEXT: por %xmm13, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[3,0] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: por %xmm15, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0] +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[0,1],xmm7[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,0] -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movdqa %xmm4, (%rsi) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movaps %xmm12, (%rcx) -; SSE-NEXT: movaps %xmm14, (%r8) -; SSE-NEXT: movaps %xmm2, (%r9) +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm7[2,0] +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] +; SSE-NEXT: movdqa %xmm2, (%rsi) +; SSE-NEXT: movdqa %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps %xmm11, (%r8) +; SSE-NEXT: movaps %xmm3, (%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf8: @@ -907,72 +908,63 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa 144(%rdi), %xmm13 -; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa 128(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm9 -; SSE-NEXT: movdqa 64(%rdi), %xmm12 -; SSE-NEXT: movdqa (%rdi), %xmm7 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa 32(%rdi), %xmm11 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm14 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa 96(%rdi), %xmm7 +; SSE-NEXT: movdqa 128(%rdi), %xmm15 +; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] -; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] 
-; SSE-NEXT: andps %xmm7, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,1] +; SSE-NEXT: movaps %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: andps %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,1] -; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: andps %xmm6, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,1] +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -980,23 +972,23 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 -; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: andnps %xmm2, %xmm13 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm2, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psrlq $48, 
%xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1005,43 +997,42 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; SSE-NEXT: pand %xmm3, %xmm15 ; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,3] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] ; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = 
xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] @@ -1049,76 +1040,72 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm11[3,0] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm13[3,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[0,1,0,3] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: por %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: por %xmm14, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm11[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = 
xmm7[0,1],xmm8[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm9[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm12[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm12, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,4,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0] ; SSE-NEXT: por %xmm2, %xmm8 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] @@ -1128,15 +1115,15 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movdqa %xmm7, 16(%rdx) -; SSE-NEXT: movaps %xmm13, (%rdx) +; SSE-NEXT: movdqa %xmm6, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps %xmm15, (%rcx) -; SSE-NEXT: movaps %xmm11, 16(%r8) +; SSE-NEXT: movaps %xmm13, 16(%r8) ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps %xmm3, 16(%r9) ; SSE-NEXT: movaps %xmm8, (%r9) -; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf16: @@ -1759,86 +1746,89 @@ define void @load_i16_stride5_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $408, %rsp # imm = 0x198 ; SSE-NEXT: movdqa 64(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa 224(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa 176(%rdi), %xmm10 +; SSE-NEXT: movdqa 160(%rdi), %xmm11 +; SSE-NEXT: movdqa 176(%rdi), %xmm12 ; SSE-NEXT: movdqa 208(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm11 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm10, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm15, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,0,3] -; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd 
{{.*#+}} xmm3 = xmm10[0,2,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa 272(%rdi), %xmm7 -; SSE-NEXT: andps %xmm10, %xmm3 +; SSE-NEXT: movdqa 272(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: andps %xmm15, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: movdqa 256(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 240(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm10, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm1 @@ -1853,30 +1843,30 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: andps %xmm10, %xmm2 +; 
SSE-NEXT: andps %xmm15, %xmm2 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] @@ -1885,25 +1875,22 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 -; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movaps %xmm15, %xmm3 ; SSE-NEXT: andnps %xmm2, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1911,24 +1898,25 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm6 -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1939,19 +1927,19 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1962,331 +1950,339 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0,1,3] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: andnps %xmm14, %xmm1 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: andnps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm13[3,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: andnps %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[3,0] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: andnps %xmm8, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,4,6,7] ; 
SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[3,0] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[3,0] -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: pandn %xmm11, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0] +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm14[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm8[0,2] +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,2] +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm11[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps 
{{.*#+}} xmm14 = xmm14[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] +; SSE-NEXT: por %xmm12, %xmm2 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,0] +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0] +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps %xmm12, 16(%r8) -; SSE-NEXT: movaps %xmm6, 48(%r8) -; SSE-NEXT: movaps %xmm9, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r8) -; SSE-NEXT: movaps %xmm3, 16(%r9) -; SSE-NEXT: movaps %xmm14, 48(%r9) -; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: movaps %xmm1, 32(%r9) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm10, 16(%r8) +; SSE-NEXT: movaps %xmm9, 48(%r8) +; SSE-NEXT: movaps %xmm13, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps %xmm11, 16(%r9) +; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm2, (%r9) +; SSE-NEXT: movaps %xmm3, 32(%r9) ; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2300,15 +2296,15 @@ define void @load_i16_stride5_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] @@ -2317,234 +2313,237 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm6, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm10[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm6 +; 
AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm12[4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm15[2,3],xmm2[4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm14, %xmm9 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpsllq $48, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm5[4,5],xmm13[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm11[2,3],xmm15[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm10[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,1,2,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm3[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 -; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm13[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm5[4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2,3],xmm3[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb 
%xmm1, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm12[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm13[4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm9[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: 
vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm12[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -2553,13 +2552,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, 
32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 @@ -2568,174 +2567,171 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $280, %rsp # imm = 0x118 +; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4,5],xmm10[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4],ymm8[5],ymm3[6,7],ymm8[8],ymm3[9,10],ymm8[11],ymm3[12],ymm8[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6],ymm13[7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1,2,3],xmm11[4,5],xmm13[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4],ymm3[5],ymm12[6,7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12],ymm3[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm15 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1,2,3],xmm10[4,5],xmm12[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm0, %ymm12 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10,11],ymm8[12],ymm3[13],ymm8[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, 
%xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3],xmm0[4,5,6],xmm13[7] -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm3[0],xmm15[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2,3,4],ymm11[5,6,7],ymm9[8,9,10,11,12],ymm11[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm13[1],xmm14[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm3[0,1],xmm15[2],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5,6,7],ymm14[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm11[1],xmm10[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm13[2],xmm14[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm13[2],xmm7[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm11[2],xmm10[3] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1],ymm8[2],ymm5[3],ymm8[4],ymm5[5,6],ymm8[7],ymm5[8,9],ymm8[10],ymm5[11],ymm8[12],ymm5[13,14],ymm8[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = 
-; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1],ymm6[2],ymm1[3],ymm6[4],ymm1[5,6],ymm6[7],ymm1[8,9],ymm6[10],ymm1[11],ymm6[12],ymm1[13,14],ymm6[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm7[1],xmm13[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: 
vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7,8],ymm2[9],ymm11[10],ymm2[11],ymm11[12,13],ymm2[14],ymm11[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm12[2],xmm15[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11 -; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm1[1,2],ymm6[3],ymm1[4],ymm6[5],ymm1[6,7],ymm6[8],ymm1[9,10],ymm6[11],ymm1[12],ymm6[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm12[2],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4],ymm5[5],ymm15[6,7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12],ymm5[13],ymm15[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm10[2],xmm11[3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] ; AVX2-SLOW-NEXT: 
vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] @@ -2744,215 +2740,217 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) -; AVX2-SLOW-NEXT: addq $280, %rsp # imm = 0x118 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) +; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm12, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm13, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm10, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm11, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} 
ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7],ymm10[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5],ymm7[6],ymm3[7,8],ymm7[9],ymm3[10,11],ymm7[12],ymm3[13],ymm7[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqu %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5],ymm12[6],ymm4[7,8],ymm12[9],ymm4[10,11],ymm12[12],ymm4[13],ymm12[14],ymm4[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: 
vpermd %ymm11, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4],ymm8[5],ymm6[6,7],ymm8[8],ymm6[9,10],ymm8[11],ymm6[12],ymm8[13],ymm6[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7,8],ymm4[9],ymm12[10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm3[2],ymm7[3],ymm3[4],ymm7[5,6],ymm3[7],ymm7[8,9],ymm3[10],ymm7[11],ymm3[12],ymm7[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> @@ -2964,20 +2962,20 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5],mem[6],ymm9[7,8],mem[9],ymm9[10,11],mem[12],ymm9[13],mem[14],ymm9[15] +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm12[0,1],ymm14[2],ymm12[3],ymm14[4],ymm12[5,6],ymm14[7],ymm12[8,9],ymm14[10],ymm12[11],ymm14[12],ymm12[13,14],ymm14[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -2992,233 +2990,230 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm15, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5],ymm2[6],ymm7[7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13],ymm2[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1,2,3],xmm1[4,5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm6[1,2],ymm11[3],ymm6[4],ymm11[5],ymm6[6,7],ymm11[8],ymm6[9,10],ymm11[11],ymm6[12],ymm11[13],ymm6[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5],ymm10[6],ymm5[7,8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13],ymm10[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm9, %ymm8, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5],ymm8[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm13, %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5],ymm11[6],ymm15[7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13],ymm11[14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} 
xmm15 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7],ymm7[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm9[0],xmm8[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm8[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7],ymm14[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm6[2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm8[0],xmm9[1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11],ymm10[12],ymm6[13],ymm10[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm12[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1],ymm5[2],mem[3],ymm5[4],mem[5,6],ymm5[7],mem[8,9],ymm5[10],mem[11],ymm5[12],mem[13,14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = 
xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4],ymm7[5],ymm3[6,7],ymm7[8],ymm3[9,10],ymm7[11],ymm3[12],ymm7[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm8[0,1],xmm9[2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10],ymm6[11],ymm10[12,13],ymm6[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm6[2],ymm10[3],ymm6[4],ymm10[5,6],ymm6[7],ymm10[8,9],ymm6[10],ymm10[11],ymm6[12],ymm10[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3],ymm13[4],ymm7[5,6],ymm13[7],ymm7[8,9],ymm13[10],ymm7[11],ymm13[12],ymm7[13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm15[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4],ymm13[5],ymm7[6,7],ymm13[8],ymm7[9,10],ymm13[11],ymm7[12],ymm13[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm6[0,1],xmm15[2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte 
Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -3657,173 +3652,170 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1000, %rsp # imm = 0x3E8 -; SSE-NEXT: movdqa 464(%rdi), %xmm4 -; SSE-NEXT: movdqa 400(%rdi), %xmm10 -; SSE-NEXT: movdqa 416(%rdi), %xmm11 -; SSE-NEXT: movdqa 
448(%rdi), %xmm5 +; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8 +; SSE-NEXT: movdqa 464(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa 400(%rdi), %xmm8 +; SSE-NEXT: movdqa 416(%rdi), %xmm11 +; SSE-NEXT: movdqa 448(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa 96(%rdi), %xmm9 -; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm15 +; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa 128(%rdi), %xmm14 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; SSE-NEXT: movaps %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} 
xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa (%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 352(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 336(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 320(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 592(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 
608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa 576(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 560(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm13, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm6, %xmm4 +; SSE-NEXT: andps %xmm13, %xmm4 ; SSE-NEXT: orps %xmm4, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm2 @@ -3832,27 +3824,27 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: andps %xmm13, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 528(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] @@ -3867,802 +3859,800 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: movdqa 480(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] ; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm1 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: psllq $48, %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm3, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: psllq $48, %xmm4 +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: andnps %xmm4, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: psllq $48, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 
16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, 
%xmm4 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd $232, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 
-; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; 
SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[2,3] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; 
SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm4 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: andnps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm15, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[3,0] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: andnps %xmm13, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] +; 
SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm9, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm10, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: 
pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm14, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm12, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: andnps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm7, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} 
xmm3 = xmm9[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: andnps %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: andnps %xmm10, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm13[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm12[0,2] -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, 
%xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm9[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[0,2] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2] +; SSE-NEXT: movaps %xmm1, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm14[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw 
{{.*#+}} xmm1 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,7,4,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: pshufhw $232, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: andnps %xmm11, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, (%rsp), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,0] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,0] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 
+; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0] -; SSE-NEXT: por %xmm13, %xmm15 -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm12[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: 
movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: movaps %xmm9, 96(%r8) -; SSE-NEXT: movaps %xmm11, 80(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%r8) -; SSE-NEXT: movaps %xmm15, 112(%r9) -; SSE-NEXT: movaps %xmm0, 96(%r9) -; SSE-NEXT: movaps %xmm1, 80(%r9) -; SSE-NEXT: movaps %xmm2, 64(%r9) -; SSE-NEXT: movaps %xmm3, 48(%r9) -; SSE-NEXT: movaps %xmm5, 32(%r9) -; SSE-NEXT: movaps %xmm6, 16(%r9) -; SSE-NEXT: movaps %xmm7, (%r9) -; SSE-NEXT: addq $1000, %rsp # imm = 0x3E8 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: orps %xmm9, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm10, 112(%r8) +; SSE-NEXT: movaps %xmm14, 96(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps 
%xmm1, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps %xmm6, 112(%r9) +; SSE-NEXT: movaps %xmm2, 96(%r9) +; SSE-NEXT: movaps %xmm3, 80(%r9) +; SSE-NEXT: movaps %xmm4, 64(%r9) +; SSE-NEXT: movaps %xmm5, 48(%r9) +; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm7, 16(%r9) +; SSE-NEXT: movaps %xmm8, (%r9) +; SSE-NEXT: addq $1016, %rsp # imm = 0x3F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf64: @@ -4679,13 +4669,12 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] @@ -4695,9 +4684,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4705,12 +4694,12 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: 
vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4718,10 +4707,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] @@ -4730,7 +4718,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] @@ -4738,74 +4726,75 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4831,144 +4820,143 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2,3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm11[2,3],mem[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm9[0,1],mem[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, (%rsp), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm3[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm9, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpsllq $48, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm12[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,3,2,3] -; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1,2,3],xmm12[4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: 
vpsrlq $48, %xmm10, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsllq $48, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = 
mem[0,1,2,3],xmm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -4976,101 +4964,101 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm11[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm8[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm7[0,1,2,3],mem[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1],xmm10[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,2,0] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm14[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5079,104 +5067,104 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, (%rsp), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; 
AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm7[2,3],xmm9[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1,2,3],xmm14[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm4[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm12[2,3],mem[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm9[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5196,115 +5184,114 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, 
%ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,1,1,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = 
xmm14[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm12[0,1,2],mem[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 
= mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -5330,413 +5317,392 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; 
AVX1-ONLY-NEXT: vmovaps %ymm10, (%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm12, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) ; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1064, %rsp # imm = 0x428 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX2-SLOW-NEXT: subq $1048, %rsp # imm = 0x418 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4],ymm13[5],ymm9[6,7],ymm13[8],ymm9[9,10],ymm13[11],ymm9[12],ymm13[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm13[1,2],ymm10[3],ymm13[4],ymm10[5],ymm13[6,7],ymm10[8],ymm13[9,10],ymm10[11],ymm13[12],ymm10[13],ymm13[14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3],ymm4[4],ymm0[5],ymm4[6,7],ymm0[8],ymm4[9,10],ymm0[11],ymm4[12],ymm0[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2],mem[3],ymm1[4,5],mem[6],ymm1[7,8],mem[9],ymm1[10],mem[11],ymm1[12,13],mem[14],ymm1[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10,11],ymm13[12],ymm9[13],ymm13[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm15[1],ymm7[2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10],ymm15[11],ymm7[12,13],ymm15[14],ymm7[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6],xmm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm4[0],mem[1],ymm4[2],mem[3],ymm4[4,5],mem[6],ymm4[7,8],mem[9],ymm4[10],mem[11],ymm4[12,13],mem[14],ymm4[15] -; 
AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6],xmm8[7] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm8, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13],ymm9[14],ymm12[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 +; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0],xmm9[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 624(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0],xmm5[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm5[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm2[0],xmm5[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm10 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm8 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm13 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1,2,3,4],ymm13[5,6,7],ymm2[8,9,10,11,12],ymm13[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7],ymm4[8,9,10,11,12],ymm13[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm2[0],xmm5[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0],xmm5[1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm15, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm4[2],xmm15[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm9[2],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm12[2],xmm10[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm15[2],xmm12[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm9[2],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm5[2],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5],ymm5[6],ymm15[7,8],ymm5[9],ymm15[10,11],ymm5[12],ymm15[13],ymm5[14],ymm15[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm6[2],ymm11[3],ymm6[4],ymm11[5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11],ymm6[12],ymm11[13,14],ymm6[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0],xmm13[1],xmm14[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm2[0],xmm12[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = mem[0],xmm4[1],mem[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm12[1],mem[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm8[0,1],mem[2],ymm8[3],mem[4],ymm8[5,6],mem[7],ymm8[8,9],mem[10],ymm8[11],mem[12],ymm8[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm10[1],mem[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3,4],xmm3[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; 
AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendw $181, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm13[1,2],ymm9[3],ymm13[4],ymm9[5],ymm13[6,7],ymm9[8],ymm13[9,10],ymm9[11],ymm13[12],ymm9[13],ymm13[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm15[2],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] 
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm14[1],ymm10[2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10],ymm14[11],ymm10[12,13],ymm14[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm12[1,2],mem[3],ymm12[4],mem[5],ymm12[6,7],mem[8],ymm12[9,10],mem[11],ymm12[12],mem[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm13[0,1],mem[2],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm12[0],mem[1],ymm12[2],mem[3],ymm12[4,5],mem[6],ymm12[7,8],mem[9],ymm12[10],mem[11],ymm12[12,13],mem[14],ymm12[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm8[1,2],ymm14[3],ymm8[4],ymm14[5],ymm8[6,7],ymm14[8],ymm8[9,10],ymm14[11],ymm8[12],ymm14[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm15[2],mem[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = 
ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm11[2],mem[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm14[2],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3] +; 
AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm12[2],xmm10[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendw $107, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 
32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5],ymm9[6],mem[7,8],ymm9[9],mem[10,11],ymm9[12],mem[13],ymm9[14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] @@ -5744,475 +5710,496 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm10[0,1],mem[2],ymm10[3],mem[4],ymm10[5,6],mem[7],ymm10[8,9],mem[10],ymm10[11],mem[12],ymm10[13,14],mem[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: 
vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: 
vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: addq $1064, %rsp # imm = 0x428 +; AVX2-SLOW-NEXT: addq $1048, %rsp # imm = 0x418 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm9 +; AVX2-FAST-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1,2],ymm14[3],ymm4[4],ymm14[5],ymm4[6,7],ymm14[8],ymm4[9,10],ymm14[11],ymm4[12],ymm14[13],ymm4[14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,0,2,4,6,1,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
xmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5],ymm4[6],ymm9[7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13],ymm4[14],ymm9[15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5],ymm0[6],ymm6[7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13],ymm0[14],ymm6[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4],ymm1[5],ymm6[6,7],ymm1[8],ymm6[9,10],ymm1[11],ymm6[12],ymm1[13],ymm6[14,15] -; 
AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4,5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10],ymm10[11],ymm13[12,13],ymm10[14],ymm13[15] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5,6],xmm10[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6],xmm11[7] -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm4, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm5[0],mem[1],ymm5[2],mem[3],ymm5[4,5],mem[6],ymm5[7,8],mem[9],ymm5[10],mem[11],ymm5[12,13],mem[14],ymm5[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3],xmm11[4,5,6],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5],mem[6],ymm9[7,8],mem[9],ymm9[10,11],mem[12],ymm9[13],mem[14],ymm9[15] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,1,3,0,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,2,3,1,3,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} 
ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm1[5,6,7],ymm10[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $107, (%rsp), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5],mem[6],ymm12[7,8],mem[9],ymm12[10,11],mem[12],ymm12[13],mem[14],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm15[2],ymm14[3],ymm15[4],ymm14[5,6],ymm15[7],ymm14[8,9],ymm15[10],ymm14[11],ymm15[12],ymm14[13,14],ymm15[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13],ymm15[14],ymm14[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm8[2],ymm4[3],ymm8[4],ymm4[5,6],ymm8[7],ymm4[8,9],ymm8[10],ymm4[11],ymm8[12],ymm4[13,14],ymm8[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm11[0],mem[1],ymm11[2,3],mem[4],ymm11[5],mem[6],ymm11[7,8],mem[9],ymm11[10,11],mem[12],ymm11[13],mem[14],ymm11[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw $214, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm3[0],mem[1,2],ymm3[3],mem[4],ymm3[5],mem[6,7],ymm3[8],mem[9,10],ymm3[11],mem[12],ymm3[13],mem[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7,8],ymm12[9],ymm5[10],ymm12[11],ymm5[12,13],ymm12[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm3[1,2],mem[3],ymm3[4],mem[5],ymm3[6,7],mem[8],ymm3[9,10],mem[11],ymm3[12],mem[13],ymm3[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm6[1,2],mem[3],ymm6[4],mem[5],ymm6[6,7],mem[8],ymm6[9,10],mem[11],ymm6[12],mem[13],ymm6[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4],ymm8[5],ymm4[6,7],ymm8[8],ymm4[9,10],ymm8[11],ymm4[12],ymm8[13],ymm4[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,1,3,0,2,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte 
Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,u,u,6,0,3,5> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4],xmm10[5,6,7] -; 
AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) @@ -6234,36 +6221,35 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-FAST-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: subq $1080, %rsp # imm = 0x438 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm0 @@ -6272,420 +6258,428 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4],ymm15[5],ymm8[6,7],ymm15[8],ymm8[9,10],ymm15[11],ymm8[12],ymm15[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10,11],ymm9[12],ymm5[13],ymm9[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm4[1,2],ymm12[3],ymm4[4],ymm12[5],ymm4[6,7],ymm12[8],ymm4[9,10],ymm12[11],ymm4[12],ymm12[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: 
vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4],ymm10[5],ymm0[6,7],ymm10[8],ymm0[9,10],ymm10[11],ymm0[12],ymm10[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 
$1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = 
mem[0],ymm1[1],mem[2],ymm1[3],mem[4,5],ymm1[6],mem[7,8],ymm1[9],mem[10],ymm1[11],mem[12,13],ymm1[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm9[0],mem[1],ymm9[2],mem[3],ymm9[4,5],mem[6],ymm9[7,8],mem[9],ymm9[10],mem[11],ymm9[12,13],mem[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13],ymm12[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,3],xmm9[4,5,6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, 
%xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm9, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5],ymm10[6],mem[7,8],ymm10[9],mem[10,11],ymm10[12],mem[13],ymm10[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,3],xmm9[4,5,6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm0 -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm12[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm13[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm2[0],xmm6[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm15[2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm12[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm1[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm13[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm6[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm10[0],xmm14[1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5],ymm6[6],mem[7,8],ymm6[9],mem[10,11],ymm6[12],mem[13],ymm6[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm8[0,1],mem[2],ymm8[3],mem[4],ymm8[5,6],mem[7],ymm8[8,9],mem[10],ymm8[11],mem[12],ymm8[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm9[1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte 
Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0],xmm5[1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm15[0],mem[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $41, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1,2],mem[3],ymm1[4],mem[5],ymm1[6,7],mem[8],ymm1[9,10],mem[11],ymm1[12],mem[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3] -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm1[0,1],mem[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm6[0],mem[1],ymm6[2],mem[3],ymm6[4,5],mem[6],ymm6[7,8],mem[9],ymm6[10],mem[11],ymm6[12,13],mem[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq 
{{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm8[1,2],mem[3],ymm8[4],mem[5],ymm8[6,7],mem[8],ymm8[9,10],mem[11],ymm8[12],mem[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0],xmm5[1],xmm11[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0,1],mem[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm14[1],mem[2],ymm14[3],mem[4,5],ymm14[6],mem[7,8],ymm14[9],mem[10],ymm14[11],mem[12,13],ymm14[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0],xmm5[1],xmm11[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm12[0,1],mem[2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm10[1],ymm8[2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm4[0,1],mem[2],xmm4[3] +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm13[1,2],ymm11[3],ymm13[4],ymm11[5],ymm13[6,7],ymm11[8],ymm13[9,10],ymm11[11],ymm13[12],ymm11[13],ymm13[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm7[2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = 
xmm9[0],xmm3[1],xmm9[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm5[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5],ymm5[6],mem[7,8],ymm5[9],mem[10,11],ymm5[12],mem[13],ymm5[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3,4],xmm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm10[2],ymm8[3],ymm10[4],ymm8[5,6],ymm10[7],ymm8[8,9],ymm10[10],ymm8[11],ymm10[12],ymm8[13,14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, 
%ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3],ymm6[4],mem[5,6],ymm6[7],mem[8,9],ymm6[10],mem[11],ymm6[12],mem[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] @@ -6715,28 +6709,29 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: addq $1080, %rsp # imm = 0x438 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX512F-SLOW-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] ; AVX512F-SLOW-NEXT: 
vmovdqa %ymm2, %ymm6 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm7 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] @@ -6745,92 +6740,91 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm8 ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm9 ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm23 +; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm19 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa64 176(%rdi), %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4],ymm3[5],ymm10[6,7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12],ymm3[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm10, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] ; AVX512F-SLOW-NEXT: vpermq 
{{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3],ymm12[4],ymm0[5,6],ymm12[7],ymm0[8,9],ymm12[10],ymm0[11],ymm12[12],ymm0[13,14],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm22 +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: 
vmovdqa64 %xmm19, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm24 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm7[2],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm12[1],ymm3[2,3],ymm12[4],ymm3[5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10,11],ymm12[12],ymm3[13],ymm12[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm5 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 @@ -6841,782 +6835,792 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), 
%xmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0],xmm11[1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm13[2],ymm1[3],ymm13[4],ymm1[5,6],ymm13[7],ymm1[8,9],ymm13[10],ymm1[11],ymm13[12],ymm1[13,14],ymm13[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm13[1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[3,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm27[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm28 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm28[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm25, %zmm24 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7],ymm12[8,9],ymm15[10],ymm12[11],ymm15[12],ymm12[13,14],ymm15[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4],xmm0[5,6,7] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm12[1],ymm3[2,3],ymm12[4],ymm3[5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10,11],ymm12[12],ymm3[13],ymm12[14],ymm3[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm2[1,2],ymm15[3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1,2],ymm7[3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm19[3,1,2,3] -; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm24 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm21[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm24[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm26[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm10[1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm11[2],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm13[2],xmm10[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10,11],ymm11[12],ymm6[13],ymm11[14],ymm6[15] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm16, %xmm15 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm11[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = 
ymm13[0],ymm6[1,2],ymm13[3],ymm6[4],ymm13[5],ymm6[6,7],ymm13[8],ymm6[9,10],ymm13[11],ymm6[12],ymm13[13],ymm6[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2],ymm8[3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm28[0,3,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm27, %xmm13 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4],ymm6[5],ymm3[6,7],ymm6[8],ymm3[9,10],ymm6[11],ymm3[12],ymm6[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] ; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm3[1],ymm12[2,3],ymm3[4],ymm12[5],ymm3[6],ymm12[7,8],ymm3[9],ymm12[10,11],ymm3[12],ymm12[13],ymm3[14],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm26 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2],ymm11[3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm21[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm19, %xmm13 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm27 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, 
%xmm11 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm10[2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm26[0,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm20 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm24, %xmm8 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm25, %zmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm7[2],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5],ymm5[6],ymm11[7,8],ymm5[9],ymm11[10,11],ymm5[12],ymm11[13],ymm5[14],ymm11[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm10[0,1],ymm14[2],ymm10[3],ymm14[4],ymm10[5,6],ymm14[7],ymm10[8,9],ymm14[10],ymm10[11],ymm14[12],ymm10[13,14],ymm14[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm8[1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3,4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm8[1],xmm12[2,3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm31 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm16[2],xmm12[3],xmm16[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3,4],xmm13[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX512F-SLOW-NEXT: 
vpshufb %xmm8, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm27[2],xmm15[3],xmm27[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm12 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm24[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm31 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm20[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $148, (%rsp), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm11 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4],xmm11[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm13 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm10[1,2],ymm14[3],ymm10[4],ymm14[5],ymm10[6,7],ymm14[8],ymm10[9,10],ymm14[11],ymm10[12],ymm14[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0],xmm8[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm0[2],xmm7[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1,2,3],xmm2[4,5],xmm9[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6,7],ymm9[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10,11],ymm14[12],ymm10[13],ymm14[14],ymm10[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4],xmm10[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = 
[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vpblendw $173, (%rsp), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6],ymm12[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm6[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm24 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm11[1],ymm5[2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7,8],ymm11[9],ymm5[10],ymm11[11],ymm5[12,13],ymm11[14],ymm5[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm1[2],ymm4[3],ymm1[4],ymm4[5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11],ymm1[12],ymm4[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm1[1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm5[2],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4,5],xmm7[6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm21 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 
= ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 ; AVX512F-SLOW-NEXT: movb $7, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5,6],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4],ymm3[5],ymm0[6,7],ymm3[8],ymm0[9,10],ymm3[11],ymm0[12],ymm3[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm0[2],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0],ymm6[1,2],mem[3],ymm6[4],mem[5],ymm6[6,7],mem[8],ymm6[9,10],mem[11],ymm6[12],mem[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $82, (%rsp), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6],xmm6[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw $148, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = 
ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6],xmm4[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512F-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-SLOW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: 
vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm9 -; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX512F-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm11 ; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 ; AVX512F-FAST-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0,1],ymm6[2],ymm11[3],ymm6[4],ymm11[5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11],ymm6[12],ymm11[13,14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm30 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,4,7,1,4,6,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm17, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [8,9,3,2,4,5,7,6] -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm19, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,4,7,1,4,6,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] +; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,3,1,3,0,3,5,7] -; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm27 -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm21, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm25, %zmm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] +; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm29 +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm18, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; 
AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm4 ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm4[1,2],ymm15[3],ymm4[4],ymm15[5],ymm4[6,7],ymm15[8],ymm4[9,10],ymm15[11],ymm4[12],ymm15[13],ymm4[14,15] -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm15 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm11 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm17, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %ymm20, %ymm19, %ymm3 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm17, %ymm3 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm21, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10],ymm10[11],ymm7[12,13],ymm10[14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm31 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm30 +; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <2,u,u,u,4,7,1,6> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10,11],ymm15[12],ymm6[13],ymm15[14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm22, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <2,u,u,u,4,7,1,6> +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm22 +; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm21, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} 
ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vporq %ymm3, %ymm0, %ymm19 -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm13, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm9 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,2,5,7,4,7,u,u> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13],ymm12[14],ymm14[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm21, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vporq %ymm1, %ymm0, %ymm18 +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm31, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm11 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <0,2,5,7,4,7,u,u> +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm20, %ymm10 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,3,1,4,6,3] -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm12 -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] +; AVX512F-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] +; AVX512F-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: 
vpermd %ymm1, %ymm24, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,3,2,3,1,3,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm25, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm26, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-FAST-NEXT: vpblendw $74, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = ymm4[0],mem[1],ymm4[2],mem[3],ymm4[4,5],mem[6],ymm4[7,8],mem[9],ymm4[10],mem[11],ymm4[12,13],mem[14],ymm4[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm23 -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm22, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm21, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm29, %xmm4 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-FAST-NEXT: vpblendw $181, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6],xmm10[7] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 +; AVX512F-FAST-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm28 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm21, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm20, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm27, %xmm4 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512F-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4],ymm4[5],ymm6[6,7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12],ymm4[13],ymm6[14,15] -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4],ymm6[5],ymm8[6,7],ymm6[8],ymm8[9,10],ymm6[11],ymm8[12],ymm6[13],ymm8[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm16, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm25, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm21 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10,11],ymm5[12],ymm12[13],ymm5[14],ymm12[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm30 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm17[2],xmm1[3],xmm17[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,3,5,2,5,7,u,u> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7,8],ymm14[9],ymm7[10],ymm14[11],ymm7[12,13],ymm14[14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <0,2,u,u,5,7,2,4> +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm25, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm5, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm14, %ymm14 +; 
AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm21, %zmm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa %ymm15, %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm24, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm14 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3],ymm8[4],ymm0[5,6],ymm8[7],ymm0[8,9],ymm8[10],ymm0[11],ymm8[12],ymm0[13,14],ymm8[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <0,2,u,u,5,7,2,4> -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5],mem[6],ymm15[7,8],mem[9],ymm15[10,11],mem[12],ymm15[13],mem[14],ymm15[15] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm26, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3,4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,4,6,0,1,4,6,0] -; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm13, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm25, %zmm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; 
AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm24, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm10 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm29[2],xmm2[3],xmm29[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm15[2],ymm3[3],ymm15[4],ymm3[5,6],ymm15[7],ymm3[8,9],ymm15[10],ymm3[11],ymm15[12],ymm3[13,14],ymm15[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm29 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm19 -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm26, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm4[2],ymm7[3],ymm4[4],ymm7[5,6],ymm4[7],ymm7[8,9],ymm4[10],ymm7[11],ymm4[12],ymm7[13,14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm22 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5],ymm4[6],ymm13[7,8],ymm4[9],ymm13[10,11],ymm4[12],ymm13[13],ymm4[14],ymm13[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm25, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm21, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm7 -; 
AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0],xmm7[1],xmm13[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <1,3,6,0,5,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,6,0,5,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7],ymm12[8,9],ymm10[10],ymm12[11],ymm10[12],ymm12[13,14],ymm10[15] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm24 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0],xmm11[1],xmm10[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm30 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm6[2],ymm8[3],ymm6[4],ymm8[5,6],ymm6[7],ymm8[8,9],ymm6[10],ymm8[11],ymm6[12],ymm8[13,14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm7[2],xmm13[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm9[1,2],ymm14[3],ymm9[4],ymm14[5],ymm9[6,7],ymm14[8],ymm9[9,10],ymm14[11],ymm9[12],ymm14[13],ymm9[14,15] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; 
AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm8[2],ymm6[3],ymm8[4],ymm6[5,6],ymm8[7],ymm6[8,9],ymm8[10],ymm6[11],ymm8[12],ymm6[13,14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm20 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm9[2],xmm7[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <1,4,6,3,6,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm26, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm17 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,3,u,u,5,0,2,7> -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm26, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, 
%zmm3, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm15[1,2],ymm4[3],ymm15[4],ymm4[5],ymm15[6,7],ymm4[8],ymm15[9,10],ymm4[11],ymm15[12],ymm4[13],ymm15[14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,3,u,u,5,0,2,7> +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm10[1],ymm5[2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7,8],ymm10[9],ymm5[10],ymm10[11],ymm5[12,13],ymm10[14],ymm5[15] -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm8, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0],xmm13[1,2,3],xmm3[4,5],xmm13[6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm25 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 -; AVX512F-FAST-NEXT: 
vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm24, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm12 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm12[1,2],ymm15[3],ymm12[4],ymm15[5],ymm12[6,7],ymm15[8],ymm12[9,10],ymm15[11],ymm12[12],ymm15[13],ymm12[14,15] -; AVX512F-FAST-NEXT: vmovdqa %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm3[1],ymm15[2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10],ymm3[11],ymm15[12,13],ymm3[14],ymm15[15] -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm11[1,2],ymm9[3],ymm11[4],ymm9[5],ymm11[6,7],ymm9[8],ymm11[9,10],ymm9[11],ymm11[12],ymm9[13],ymm11[14,15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm26, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, 
%xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm27, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm26, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm2 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm25, %zmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm10[2],ymm5[3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8,9],ymm10[10],ymm5[11],ymm10[12],ymm5[13,14],ymm10[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,3,u,u,6,0,3,5> -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm3 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm12[2],ymm10[3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8,9],ymm12[10],ymm10[11],ymm12[12],ymm10[13,14],ymm12[15] +; AVX512F-FAST-NEXT: vmovdqa 
{{.*#+}} ymm4 = <1,3,u,u,6,0,3,5> +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5],ymm6[6],ymm15[7,8],ymm6[9],ymm15[10,11],ymm6[12],ymm15[13],ymm6[14],ymm15[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] ; AVX512F-FAST-NEXT: movb $7, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm6, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512F-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2],ymm5[3],mem[4,5],ymm5[6],mem[7,8],ymm5[9],mem[10],ymm5[11],mem[12,13],ymm5[14],mem[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6],xmm8[7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm6 = mem[0],ymm14[1],mem[2],ymm14[3],mem[4,5],ymm14[6],mem[7,8],ymm14[9],mem[10],ymm14[11],mem[12,13],ymm14[14],mem[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13],ymm13[14],ymm12[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0,1],ymm3[2],ymm15[3],ymm3[4],ymm15[5,6],ymm3[7],ymm15[8,9],ymm3[10],ymm15[11],ymm3[12],ymm15[13,14],ymm3[15] -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; 
AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm6[2],ymm13[3],ymm6[4],ymm13[5,6],ymm6[7],ymm13[8,9],ymm6[10],ymm13[11],ymm6[12],ymm13[13,14],ymm6[15] +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm9[1],ymm11[2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 @@ -7637,11 +7641,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rcx) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512F-FAST-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512F-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 993029374b700..57fab33b13fa3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -277,19 +277,19 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 
= xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3] ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero @@ -304,8 +304,8 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm2, %xmm9 ; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] @@ -322,8 +322,8 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm3, (%rsi) -; SSE-NEXT: movq %xmm4, (%rdx) +; SSE-NEXT: movq %xmm4, (%rsi) +; SSE-NEXT: movq %xmm3, (%rdx) ; SSE-NEXT: movq %xmm9, (%rcx) ; SSE-NEXT: movq %xmm2, (%r8) ; SSE-NEXT: movq %xmm6, (%r9) @@ -570,118 +570,118 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i16_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm5 ; SSE-NEXT: movdqa 32(%rdi), %xmm6 ; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3] -; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[3,0] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3] +; SSE-NEXT: pslld $16, %xmm8 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm13[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,0] ; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,0,3] ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm13 -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm12[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = 
xmm13[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: por %xmm8, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm12, %xmm8 +; SSE-NEXT: por %xmm14, %xmm8 ; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: psrlq $48, %xmm12 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm12[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm7, %xmm11 -; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm12[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,1,0,2] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm11[0],xmm10[1,2,3] +; SSE-NEXT: andps %xmm6, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 ; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; 
SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE-NEXT: psrld $16, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movdqa %xmm14, (%rcx) -; SSE-NEXT: movdqa %xmm8, (%r8) -; SSE-NEXT: movdqa %xmm11, (%r9) -; SSE-NEXT: movdqa %xmm10, (%rax) +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] +; SSE-NEXT: andps %xmm6, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movdqa %xmm8, (%rcx) +; SSE-NEXT: movdqa %xmm9, (%r8) +; SSE-NEXT: movdqa %xmm12, (%r9) +; SSE-NEXT: movdqa %xmm6, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf8: @@ -1108,289 +1108,295 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $104, %rsp -; SSE-NEXT: movdqa 112(%rdi), %xmm8 -; SSE-NEXT: movdqa 128(%rdi), %xmm12 +; SSE-NEXT: subq $136, %rsp +; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 ; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm11 ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
%xmm10, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: pslld $16, %xmm11 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm3[2,3] +; SSE-NEXT: pslld $16, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm9[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: psrld $16, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm9[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: psrld $16, %xmm13 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pand %xmm13, %xmm9 -; SSE-NEXT: por %xmm15, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[2,0] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pandn %xmm15, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: por %xmm13, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm7, 
%xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm6[0] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm3[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, 
%xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm13[1] -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm1[0],xmm15[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: andps %xmm12, %xmm15 -; SSE-NEXT: por %xmm15, %xmm2 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = 
xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: pshufd $196, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm15[0],xmm0[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm6, %xmm15 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm6[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: psrlq $48, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm5 = 
xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm8 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps %xmm9, (%rdx) -; SSE-NEXT: movdqa %xmm11, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm3, 16(%r8) -; SSE-NEXT: movdqa %xmm7, (%r8) -; SSE-NEXT: movdqa %xmm1, 16(%r9) -; SSE-NEXT: movdqa %xmm2, (%r9) +; SSE-NEXT: movdqa %xmm7, 16(%r8) +; SSE-NEXT: movdqa %xmm11, (%r8) +; SSE-NEXT: movdqa %xmm4, 16(%r9) +; SSE-NEXT: movdqa %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm12, 16(%rax) -; SSE-NEXT: movdqa %xmm15, (%rax) -; SSE-NEXT: addq $104, %rsp +; SSE-NEXT: movdqa %xmm10, (%rax) +; SSE-NEXT: addq $136, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $104, %rsp +; AVX1-ONLY-NEXT: subq $88, %rsp ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm2 @@ -1398,21 +1404,22 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = 
xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpslld $16, %xmm10, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm11 @@ -1427,28 +1434,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] @@ -1461,17 +1468,17 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1,2,3],xmm13[4,5],xmm4[6,7] @@ -1483,22 +1490,22 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm1[0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm9 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm9[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] @@ -1510,46 +1517,47 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm9[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vpshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm5[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 @@ -1558,14 +1566,14 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm4 +; 
AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm10[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] @@ -1575,44 +1583,44 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $104, %rsp +; AVX1-ONLY-NEXT: addq $88, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-SLOW-NEXT: 
vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] @@ -1620,11 +1628,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1634,7 +1642,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1661,73 +1669,73 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-SLOW-NEXT: 
vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rdx) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] @@ -1735,11 +1743,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1748,7 +1756,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1772,70 +1780,70 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = 
[255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] @@ -1843,11 +1851,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1856,7 +1864,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12 @@ -1880,40 +1888,40 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = 
[255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2209,305 +2217,274 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $488, %rsp # imm = 0x1E8 -; SSE-NEXT: movdqa 304(%rdi), %xmm5 -; SSE-NEXT: movdqa 320(%rdi), %xmm7 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 +; SSE-NEXT: movdqa 304(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] +; 
SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 352(%rdi), %xmm4 -; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 336(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 
+; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 256(%rdi), %xmm4 -; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm5 +; SSE-NEXT: movdqa 272(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm7[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa 112(%rdi), %xmm11 ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa 160(%rdi), %xmm1 -; SSE-NEXT: movdqa 176(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm11 -; SSE-NEXT: psrldq 
{{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,0] +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm5 +; SSE-NEXT: movdqa 176(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm9[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE-NEXT: movdqa 144(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm13[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm8[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: psrld $16, %xmm8 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: por %xmm9, %xmm11 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm9[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} 
xmm2 = xmm2[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; 
SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $132, (%rsp), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm5[0] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm15, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm15, 
%xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] @@ -2515,313 +2492,341 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: por %xmm6, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; 
SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm6 ; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 ; SSE-NEXT: movdqa 
%xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} 
xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: andps %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm12[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm14[1] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm14, %xmm4 +; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: 
por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: psrlq $48, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrld $16, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm7 
+; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm8, %xmm15 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm0 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm8, %xmm0 -; 
SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm14, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm7, 16(%r8) -; SSE-NEXT: movdqa %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movdqa %xmm3, 16(%r9) ; SSE-NEXT: movdqa %xmm4, 32(%r9) ; SSE-NEXT: movdqa %xmm5, 48(%r9) -; SSE-NEXT: movdqa %xmm6, (%r9) +; SSE-NEXT: movdqa %xmm11, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm12, 32(%rax) -; SSE-NEXT: movdqa %xmm15, 48(%rax) -; SSE-NEXT: movdqa %xmm1, (%rax) -; SSE-NEXT: 
addq $488, %rsp # imm = 0x1E8 +; SSE-NEXT: movdqa %xmm15, 16(%rax) +; SSE-NEXT: movdqa %xmm9, 32(%rax) +; SSE-NEXT: movdqa %xmm1, 48(%rax) +; SSE-NEXT: movdqa %xmm2, (%rax) +; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $536, %rsp # imm = 0x218 +; AVX1-ONLY-NEXT: subq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 @@ -2833,6 +2838,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 @@ -2842,9 +2848,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,3] @@ -2855,47 +2861,48 @@ define void 
@load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpslld $16, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 
= xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm0 @@ -2904,200 +2911,200 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm8 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm10[0,1],mem[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm12[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1,2,3],xmm8[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3],xmm8[4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm8 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm9[0,1],mem[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm8 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3],xmm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3],xmm6[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm9, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm12[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm3[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm15[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm11[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm2[3,4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm9[0],xmm4[0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3106,125 +3113,129 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm15[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, (%rsp), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm7[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm11 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm14[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -3241,13 +3252,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: addq $536, %rsp # imm = 0x218 +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; 
AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3260,43 +3271,44 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm6, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm10 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3],xmm13[4],xmm10[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,2,2,2,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm14[1],ymm8[2,3,4,5],ymm14[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] @@ -3304,235 +3316,237 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3],xmm11[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,0,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm8[1],ymm4[2,3,4,5],ymm8[6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3],xmm12[4,5],xmm4[6],xmm12[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm11[3],xmm8[4,5],xmm11[6],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm15 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3],xmm11[4,5],xmm9[6],xmm11[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2],ymm9[3,4,5,6,7],ymm8[8,9,10],ymm9[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm15 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3],xmm15[4,5],xmm10[6],xmm15[7] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm13[2],xmm6[3],xmm13[4,5],xmm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2],ymm10[3,4,5,6,7],ymm6[8,9,10],ymm10[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, 
%xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm10[2],mem[3,4],ymm10[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1,2],xmm12[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm8[4],xmm5[5,6],xmm8[7] -; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1,2],xmm12[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1,2],ymm5[3,4,5,6,7],ymm12[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm12[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2],xmm4[3],xmm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-SLOW-NEXT: 
vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 
32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-SLOW-NEXT: vzeroupper @@ -3542,227 +3556,231 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3],xmm13[4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm14[1],ymm8[2,3,4,5],ymm14[6],ymm8[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15>
+; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1
-; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm2
-; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm3
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
-; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm1
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
-; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm3
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm11
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
-; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm6
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
-; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm12
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3],xmm12[4,5],xmm6[6],xmm12[7]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm1
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm3
+; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
+; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
+; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm1
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21>
+; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, (%rsp), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3],xmm15[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,30,31,26,27,30,31,26,27,22,23>
-; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm7
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm8
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
+; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm9
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7]
+; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
+; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm15
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3]
+; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7]
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
+; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm14
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
+; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
+; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,30,31,26,27,30,31,26,27,22,23>
+; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
-; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7]
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10
+; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4],xmm4[5,6],xmm9[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm13 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, 
%xmm7 +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5,6],xmm8[7] -; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm7 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm12, 
%xmm4, %xmm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm15[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm3 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] @@ -3770,30 +3788,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = 
[10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -3804,13 +3822,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-NEXT: vzeroupper @@ -3820,227 +3838,231 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3],xmm13[4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshuflw 
{{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm14[1],ymm8[2,3,4,5],ymm14[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm10, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm3 +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21>
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm6
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm8
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $146, (%rsp), %ymm3, %ymm9 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm15
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3],xmm15[4,5],xmm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,30,31,26,27,30,31,26,27,22,23>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm9, %xmm15
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; 
AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm14
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,30,31,26,27,30,31,26,27,22,23>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm12, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: 
vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4],xmm4[5,6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5,6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte 
Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, 
%ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] @@ -4048,30 +4070,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -4082,13 +4104,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%rax) +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -4096,102 +4118,101 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: pushq %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: subq $136, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm5[2],ymm14[3,4],ymm5[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3],xmm5[4,5],xmm9[6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} 
ymm12 = ymm3[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5],xmm9[6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3],xmm9[4,5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm12[1],ymm9[2,3,4,5],ymm12[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm13[1],ymm9[2,3,4,5],ymm13[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3],xmm8[4,5],xmm10[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm8[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3],xmm7[4,5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3],xmm6[4,5],xmm2[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm11[1],ymm13[2,3,4,5],ymm11[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = 
[2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] @@ -4200,8 +4221,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2],ymm1[3,4],ymm14[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] @@ -4211,26 +4232,26 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm21 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] @@ -4239,9 +4260,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] @@ -4261,12 +4282,12 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] @@ -4282,358 +4303,356 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: 
vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm12[1],ymm0[2,3,4,5],ymm12[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: movw $31, 
%ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm14[4],xmm10[5],xmm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm30, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm19, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm18, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: popq %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $136, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $136, %rsp -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm11[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm11, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm2[1],ymm11[2,3,4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] 
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm6[2],xmm8[3],xmm6[4,5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm2[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3],xmm9[4,5],xmm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4,5],xmm3[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3,4,5],ymm0[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2],xmm3[3],xmm11[4,5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm31 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm13[2],ymm14[3],ymm13[4],ymm14[5,6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm16 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm16[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5,6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm16[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, 
%ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5,6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm19 +; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5,6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm23, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm23, %ymm13, %ymm4 ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 -; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm15[4],xmm9[5],xmm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5],ymm9[6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] @@ -4641,18 +4660,18 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%r9) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: addq $136, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -4661,103 +4680,103 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: pushq %rax -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; 
AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5],xmm8[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3],xmm9[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm15 +; 
AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm29 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5],xmm11[6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm9 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm10 ; AVX512DQ-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3],xmm8[4,5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3],xmm7[4,5],xmm15[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm3 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm20 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm30 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -4766,9 +4785,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7] @@ -4778,44 +4797,44 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: 
vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3],ymm12[4],ymm11[5,6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5,6],xmm9[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm17, %zmm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] @@ -4827,93 +4846,93 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2],xmm4[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm3 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; 
AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm7 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm13 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; 
AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm9, %ymm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm9, %ymm5 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm7, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0,1,2,3],xmm8[4],xmm14[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm14[4],xmm9[5],xmm14[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4,5],ymm11[6],ymm14[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm4, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7] @@ -4926,11 +4945,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm9 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-SLOW-NEXT: popq %rax ; AVX512DQ-SLOW-NEXT: vzeroupper @@ -4938,272 +4957,270 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: pushq %rax ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 
-; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3],xmm8[4,5],xmm0[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm10[1],ymm0[2,3,4,5],ymm10[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm26 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm11 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; 
AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3],xmm14[4,5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm15 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3],xmm15[4,5],xmm6[6],xmm15[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm17, %zmm13 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm28 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm30 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm30 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] 
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2],xmm0[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm9[2],ymm12[3],ymm9[4],ymm12[5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm17[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3],xmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 
; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm17, %zmm18 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5,6],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm6 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} 
ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm15 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm12[1],ymm9[2,3,4,5],ymm12[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw 
{{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3],xmm12[4],xmm6[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm13, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm12, %ymm6 ; AVX512DQ-FAST-NEXT: movw $31, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm8, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5],ymm11[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm8 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: popq %rax ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -5313,30 +5330,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1160, %rsp # imm = 0x488 +; SSE-NEXT: subq $1176, %rsp # imm = 0x498 ; SSE-NEXT: movdqa 496(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm8 +; SSE-NEXT: movdqa 512(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: movdqa 112(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm1 @@ -5353,14 +5370,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; 
SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 480(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5368,22 +5385,22 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 544(%rdi), %xmm4 +; SSE-NEXT: movdqa 544(%rdi), %xmm3 ; SSE-NEXT: movdqa 560(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -5398,28 +5415,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5437,7 +5456,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 416(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 400(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5447,21 +5466,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 448(%rdi), %xmm4 +; SSE-NEXT: movdqa 448(%rdi), %xmm3 ; SSE-NEXT: movdqa 464(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -5476,7 +5495,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5486,21 
+5505,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm4 +; SSE-NEXT: movdqa 352(%rdi), %xmm3 ; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -5515,7 +5534,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 704(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 688(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5525,36 +5544,36 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 736(%rdi), %xmm4 -; SSE-NEXT: movdqa 752(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 720(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movdqa 736(%rdi), %xmm3 +; SSE-NEXT: movdqa 752(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa 720(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5564,80 +5583,79 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 256(%rdi), %xmm3 +; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 608(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa 592(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 592(%rdi), %xmm13 ; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 640(%rdi), %xmm7 -; SSE-NEXT: movdqa 656(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0] +; SSE-NEXT: movdqa 640(%rdi), %xmm5 +; SSE-NEXT: movdqa 656(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
pslld $16, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa 624(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[1,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 624(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] @@ -5650,54 +5668,56 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand 
%xmm13, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] @@ -5709,137 +5729,135 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm10[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = 
xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: 
punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm15[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7] 
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm2 @@ -5851,13 +5869,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,1,1,1] @@ -5880,15 +5897,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 @@ -5908,154 +5925,156 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa 
%xmm4, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm12, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufhw $231, (%rsp), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # 
xmm6 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] @@ -6066,16 +6085,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 @@ -6087,19 +6106,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; 
SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm2 @@ -6107,50 +6124,51 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,2,3,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw $231, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: 
punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1],mem[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -6159,57 +6177,57 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm9[1] ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: andps %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm15[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # 
xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -6219,35 +6237,35 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm3 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm8[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: movdqa %xmm14, %xmm12 ; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: andps %xmm14, %xmm3 ; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm9[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm15 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -6257,56 +6275,55 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm3 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1],mem[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm8 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: psrlq $48, %xmm13 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 
= xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: andps %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: andps %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm2 @@ -6317,55 +6334,54 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: pshufhw $212, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm3 @@ -6377,15 +6393,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm3 @@ -6396,134 +6412,135 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: andps %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps 
%xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%r8) +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%r8) +; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%r8) +; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps %xmm1, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movdqa %xmm8, 112(%r9) -; SSE-NEXT: movdqa %xmm9, 96(%r9) -; SSE-NEXT: movdqa %xmm11, 80(%r9) +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rcx) +; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movdqa %xmm7, 112(%r9) +; SSE-NEXT: movdqa %xmm8, 96(%r9) +; SSE-NEXT: movdqa %xmm15, 80(%r9) ; SSE-NEXT: movdqa %xmm12, 64(%r9) -; SSE-NEXT: movdqa %xmm0, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm15, 112(%rax) -; SSE-NEXT: movdqa %xmm2, 96(%rax) -; SSE-NEXT: movdqa %xmm5, 80(%rax) -; SSE-NEXT: movdqa %xmm13, 64(%rax) -; SSE-NEXT: movdqa %xmm14, 48(%rax) -; SSE-NEXT: movdqa %xmm10, 32(%rax) -; SSE-NEXT: movdqa %xmm6, 16(%rax) -; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: addq $1160, %rsp # imm = 0x488 +; SSE-NEXT: movdqa %xmm14, 112(%rax) +; SSE-NEXT: movdqa %xmm3, 96(%rax) +; SSE-NEXT: movdqa %xmm4, 80(%rax) +; SSE-NEXT: movdqa %xmm10, 64(%rax) +; SSE-NEXT: movdqa %xmm11, 48(%rax) +; SSE-NEXT: movdqa %xmm9, 32(%rax) +; SSE-NEXT: movdqa %xmm5, 16(%rax) +; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: addq $1176, %rsp # imm = 0x498 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf64: @@ -6549,27 +6566,28 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; 
AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 @@ -6579,315 +6597,317 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,3] -; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: 
vmovdqa 560(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] 
; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 720(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 720(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; 
AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm15 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, 
%ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6895,115 +6915,115 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, (%rsp), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: 
vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm11[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq 
{{.*#+}} xmm3 = xmm3[0],xmm0[0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] @@ -7014,129 +7034,132 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, (%rsp), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3,4],xmm14[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: 
vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 @@ -7146,9 +7169,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7157,9 +7180,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw $243, (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm1[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 @@ -7172,21 +7195,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm13[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; 
AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -7217,15 +7239,15 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm2[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm10 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] @@ -7236,9 +7258,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload @@ -7246,18 +7268,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 
= mem[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm8[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7271,13 +7293,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] @@ -7337,7 +7360,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -7369,7 +7392,7 @@ define void @load_i16_stride6_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -7398,10 +7421,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 @@ -7474,7 +7497,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-SLOW-NEXT: subq $1272, %rsp # imm = 0x4F8 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 @@ -7483,123 +7506,118 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm7[2,3],ymm6[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[0,1],ymm6[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm5[2,3],ymm4[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm5[0,1],ymm4[0,1] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm5[2,3],ymm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[0,1],ymm4[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm6[1],ymm13[2,3,4,5],ymm6[6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm11[1],mem[2,3,4,5],ymm11[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1],xmm1[2,3],xmm13[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm10[1],ymm8[2,3,4,5],ymm10[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm2[1],xmm13[2,3],xmm2[4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm15[1],ymm11[2,3,4,5],ymm15[6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm11 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0],xmm14[1],xmm11[2,3],xmm14[4],xmm11[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3,4,5],ymm11[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3] +; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7607,181 +7625,184 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm11[0],mem[1],ymm11[2,3,4,5],mem[6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,2,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3],xmm14[4,5],xmm15[6],xmm14[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3,4,5],ymm8[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3],xmm10[4,5],xmm11[6],xmm10[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 
$1, %ymm10, %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3],xmm7[4,5],xmm12[6],xmm7[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3],xmm9[4,5],xmm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0],ymm9[1],mem[2,3,4,5],ymm9[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1],xmm9[2],xmm5[3],xmm9[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7],ymm9[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm6[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3],xmm0[4,5],xmm11[6],xmm0[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2],xmm10[3],xmm0[4,5],xmm10[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3],xmm10[4,5],xmm7[6],xmm10[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: 
vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = 
mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -7790,303 +7811,305 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,3,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm12[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = 
mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5,6],xmm15[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1,2],xmm6[3],xmm11[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $36, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2],xmm8[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7],ymm9[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm9, %ymm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm4[0,1,2,3],xmm14[4],xmm4[5,6],xmm14[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[1,1,1,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5,6],xmm5[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb 
%ymm5, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[3,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} 
xmm7 = xmm7[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2],xmm1[3],xmm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = 
mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%r8) +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm13, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rax) -; AVX2-SLOW-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rax) +; AVX2-SLOW-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; 
AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 @@ -8095,288 +8118,281 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4,5],ymm7[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm0 -; AVX2-FAST-NEXT: 
vextracti128 $1, %ymm15, %xmm2 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: 
vmovdqa 192(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm8 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm13 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1],xmm1[2,3],xmm13[4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3],xmm13[4],xmm15[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; 
AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: 
# ymm1 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte 
Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3],xmm14[4,5],xmm0[6],xmm14[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7],ymm15[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm15 -; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm13[2],xmm15[3],xmm13[4,5],xmm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3],xmm11[4,5],xmm0[6],xmm11[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3],xmm8[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm11[2],xmm6[3],xmm11[4,5],xmm6[6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FAST-NEXT: vpshufb %ymm15, 
%ymm6, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm7 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufhw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = mem[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 
= ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; 
AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -8387,256 +8403,253 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1,2,3],xmm12[4],xmm2[5,6],xmm12[7] +; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,3,2,1] -; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 
= xmm2[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3,4,5,6,7],ymm15[8,9,10],ymm12[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[2,1,0,3] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3],xmm0[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; 
AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm9[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm11 = mem[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm5 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7] -; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5],xmm14[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; 
AVX2-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte 
Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -8665,28 +8678,28 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-FAST-NEXT: addq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, (%rax) +; AVX2-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-PERLANE-NEXT: subq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 @@ -8695,548 +8708,538 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4,5],ymm7[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} 
xmm13 = xmm1[0],xmm13[1],xmm1[2,3],xmm13[4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3],xmm13[4],xmm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm15
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm14
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3],xmm14[4,5],xmm0[6],xmm14[7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7],ymm15[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $109, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm14
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm15
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm13
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm13[2],xmm15[3],xmm13[4,5],xmm15[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm12
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm6
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm11
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3],xmm11[4,5],xmm0[6],xmm11[7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm6
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3],xmm8[4,5],xmm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm6
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm11[2],xmm6[3],xmm11[4,5],xmm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm15
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm14
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm15
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 =
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm15
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm14
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufhw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,2,3,5,5,5,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm8
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm8, %xmm8
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm6, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,5,5,5,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,5,5,5,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm8, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,6,5,6,4]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1,2,3],xmm12[4],xmm2[5,6],xmm12[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm13
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm13
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm15
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3,4,5,6,7],ymm15[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm15[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[2,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,6,4]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3],xmm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm13, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,4]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,1,2,0,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3],xmm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm5, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm9[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u>
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19>
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,5,6,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm14, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm4
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm9
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm11
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm12
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm14
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm15
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5],xmm14[6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,2,3,4],ymm14[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm13
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm15
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm8
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm6
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm12, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm6
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm8
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6,7]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm15, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -9265,35 +9268,35 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 96(%r8)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 96(%r8)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8)
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r8)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%r9)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%r9)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%r9)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%r9)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r9)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%r9)
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rax)
-; AVX2-FAST-PERLANE-NEXT: addq $1272, %rsp # imm = 0x4F8
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%rax)
+; AVX2-FAST-PERLANE-NEXT: addq $1256, %rsp # imm = 0x4E8
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf64:
; AVX512F-ONLY-SLOW: # %bb.0:
-; AVX512F-ONLY-SLOW-NEXT: subq $1416, %rsp # imm = 0x588
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
+; AVX512F-ONLY-SLOW-NEXT: subq $1480, %rsp # imm = 0x5C8
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15>
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm0
; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,0,3]
@@ -9304,12 +9307,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5>
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21
-; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -9320,9 +9323,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm1
-; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,0,3]
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
@@ -9345,7 +9348,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm2
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1
@@ -9367,120 +9370,120 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3]
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm1
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30
; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm3
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm2
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0
-; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9
-; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3]
-; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3
-; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3]
-; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm3
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10
+; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,0,3]
+; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm2
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm11
-; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,2]
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1,2],xmm13[3],xmm11[4,5],xmm13[6],xmm11[7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11
-; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],mem[2,3]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2]
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0
-; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15]
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0]
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15>
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm14, %xmm14
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm6
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm13
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm6
; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7]
; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm0
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm2
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm4
; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm0
-; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5]
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23>
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm3
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5]
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23>
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
-; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7]
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3
-; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,5,5,5,5]
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,5,5,5]
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm3
-; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm3
+; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7]
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 =
ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -9489,8 +9492,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] @@ -9500,9 +9503,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] @@ -9513,63 +9516,63 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: 
vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = 
ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -9594,8 +9597,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -9612,15 +9615,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm24, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm2 @@ -9646,12 +9650,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3],xmm2[4],xmm12[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] @@ -9664,11 +9668,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm28, %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] @@ -9684,7 +9688,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] @@ -9695,226 +9699,228 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm24, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 
32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm4, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: movw $31, %ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm27 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm31 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: 
vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1],xmm2[2,3],xmm10[4],xmm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm19[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, 
%zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm26[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4],xmm13[5],xmm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm8, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm11[4],xmm9[5],xmm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte 
Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 64(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1416, %rsp # imm = 0x588 +; AVX512F-ONLY-SLOW-NEXT: addq $1480, %rsp # imm = 0x5C8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX512F-ONLY-FAST-NEXT: subq $1480, %rsp # imm = 0x5C8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9922,11 +9928,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 @@ -9937,10 +9943,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9951,42 +9957,42 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; 
AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -9995,21 +10001,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10017,47 +10023,47 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4,5],ymm1[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, 
%ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5],xmm9[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 @@ -10066,401 +10072,401 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: 
vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = 
ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb 
%ymm2, %ymm6, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} 
xmm14 = xmm6[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, 
%zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = 
xmm3[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm11[0,1,2],ymm15[3,4,5,6,7],ymm11[8,9,10],ymm15[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm0[1,2],xmm11[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: 
vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm11, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2],xmm13[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 
{{.*#+}} ymm13 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7],ymm14[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm26 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm24, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7] 
-; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm25, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm22, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = 
xmm2[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm2 ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, (%rsp), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4],xmm1[5],xmm7[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3],xmm8[4],xmm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm11, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm11, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] +; 
AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm15 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3],xmm1[4],xmm9[5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm15 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 @@ -10468,26 +10474,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm15, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 @@ -10515,8 +10521,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload @@ -10525,12 +10531,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%r8) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1512, %rsp # imm = 0x5E8 +; AVX512F-ONLY-FAST-NEXT: addq $1480, %rsp # imm = 0x5C8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -10555,11 +10561,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -10571,8 +10577,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm21 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] @@ -10592,7 +10599,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm2 @@ -10628,51 +10635,51 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm10[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; 
AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] @@ -10692,20 +10699,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm5 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10716,20 +10723,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 
%xmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,5,5,5,5] @@ -10742,20 +10749,20 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm21 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] @@ -10766,41 +10773,41 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = 
mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm3[2],ymm0[3],ymm3[4],ymm0[5,6],ymm3[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX512DQ-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 @@ -10808,25 +10815,25 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm21, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm20, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] @@ -10837,8 +10844,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, 
%zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] @@ -10847,8 +10854,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -10872,11 +10879,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm20, %zmm2 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] @@ -10887,7 +10894,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 @@ -10899,21 +10906,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = 
xmm12[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3],xmm2[4],xmm12[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm0 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm27 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] @@ -10929,7 +10936,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] @@ -10938,178 +10945,178 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm22 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = 
<8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm4 +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm30 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
[128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm2 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3,4,5],ymm4[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = 
[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm14 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = 
xmm13[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm25 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm25 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm11 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 
+; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm12 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm30[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm13, %ymm8 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm8, %zmm0, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm26 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX512DQ-SLOW-NEXT: 
vpshufb %xmm4, %xmm0, %xmm10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4],xmm13[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm14, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm8, %ymm11 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm12[1],xmm3[2,3],xmm12[4],xmm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm10, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw 
{{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rsi) @@ -11120,16 +11127,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm24 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm24 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm21 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r9) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) @@ -11140,7 +11147,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX512DQ-FAST-NEXT: subq $904, %rsp # imm = 0x388 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11148,11 +11155,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: 
vpshufb %xmm12, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 @@ -11160,61 +11167,61 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX512DQ-FAST-NEXT: vmovdqa 
{{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm3[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm5[1],ymm2[2,3,4,5],ymm5[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4,5],ymm5[6],ymm3[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm27 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 @@ -11225,115 +11232,115 @@ define void 
@load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 
96(%rdi), %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4,5],ymm1[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1,2],xmm13[3],xmm3[4,5],xmm13[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3],xmm2[4,5],xmm10[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = 
[10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm6 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3],xmm6[4,5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3],xmm6[4,5],xmm4[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm16, %zmm7 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm4 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,5,5,5] @@ -11349,362 +11356,361 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; 
AVX512DQ-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte 
Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm25 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, 
%xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm9 
; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm11 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm11[0,1,2],ymm15[3,4,5,6,7],ymm11[8,9,10],ymm15[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm0[1,2],xmm11[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm15 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm9[1,2],xmm15[3],xmm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7] +; 
AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm20 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm11, %xmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2],xmm13[3],xmm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm11 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7],ymm14[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm29 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, 
%xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm27, %zmm20 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm21 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 -; 
AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm28 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm10 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm2 ; AVX512DQ-FAST-NEXT: movw $31, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm27 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm30 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm17 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm24 ; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = 
xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm15 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3],xmm4[4],xmm15[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2,3],xmm4[4],xmm14[5],xmm4[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm26 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm26 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2,3],xmm4[4],xmm9[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2,3],xmm4[4],xmm11[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm14 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm0 ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm15, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 @@ -11721,18 +11727,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm25 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm2, %zmm4 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%r8) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 64(%r9) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FAST-NEXT: addq $936, %rsp # imm = 0x3A8 +; AVX512DQ-FAST-NEXT: addq $904, %rsp # imm = 0x388 ; AVX512DQ-FAST-NEXT: vzeroupper ; 
AVX512DQ-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 8dbd67be7769e..19530e02a99af 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -301,77 +301,77 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3] +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] ; SSE-NEXT: 
punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: psrlq $16, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: psrlq $16, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] +; SSE-NEXT: pslld $16, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movq %xmm4, (%rsi) -; SSE-NEXT: movq %xmm1, (%rdx) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movq %xmm2, (%rsi) +; SSE-NEXT: movq %xmm0, (%rdx) ; SSE-NEXT: movq %xmm7, (%rcx) ; SSE-NEXT: movq %xmm8, (%r8) -; SSE-NEXT: movq %xmm5, (%r9) -; SSE-NEXT: movq %xmm9, (%rdi) -; SSE-NEXT: movq %xmm0, (%rax) +; SSE-NEXT: movq %xmm6, (%r9) +; SSE-NEXT: movq %xmm10, (%rdi) +; SSE-NEXT: movq %xmm1, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf4: @@ -734,183 +734,167 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm6[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: andnps %xmm8, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,1,0,3] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm8 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm6[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm8[2,2] +; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: andnps %xmm5, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pand %xmm11, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[0,3,2,3] +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; SSE-NEXT: pand %xmm11, %xmm15 +; SSE-NEXT: por %xmm12, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm15, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: por %xmm12, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm13, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,2,1] +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,2,1] +; SSE-NEXT: pshuflw 
{{.*#+}} xmm15 = xmm15[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm14[0],xmm10[1,2,3] -; SSE-NEXT: andps %xmm5, %xmm10 -; SSE-NEXT: orps %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,1] ; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm14, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm11, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm15[0],xmm11[1,2,3] +; SSE-NEXT: andps %xmm3, %xmm11 +; SSE-NEXT: orps %xmm14, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: por %xmm14, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: psrld $16, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm12[0],xmm2[1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,3,2,3] -; SSE-NEXT: 
movdqa %xmm1, %xmm15 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm12[0],xmm15[1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: andps %xmm3, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm13, %xmm5 -; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm13 ; SSE-NEXT: psrlq $16, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE-NEXT: psrlq $48, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; SSE-NEXT: por %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: 
movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm12[0],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm11[0,2] -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movdqa %xmm8, (%rsi) -; SSE-NEXT: movdqa %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm10, (%rcx) -; SSE-NEXT: movdqa %xmm5, (%r8) -; SSE-NEXT: movapd %xmm2, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm10[0,2] +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movdqa %xmm5, (%rsi) +; SSE-NEXT: movdqa %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm11, (%rcx) +; SSE-NEXT: movdqa %xmm3, (%r8) +; SSE-NEXT: movapd %xmm13, (%r9) ; SSE-NEXT: movaps %xmm14, (%rdi) -; SSE-NEXT: movapd %xmm0, (%rax) +; SSE-NEXT: movapd %xmm1, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf8: @@ -1473,339 +1457,339 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; 
SSE-LABEL: load_i16_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $216, %rsp +; SSE-NEXT: subq $232, %rsp ; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: movdqa 128(%rdi), %xmm6 ; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm13 ; SSE-NEXT: movdqa 176(%rdi), %xmm15 -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,3] +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: andnps %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 96(%rdi), %xmm5 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; 
SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: andnps %xmm0, %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = 
xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm14, %xmm2 +; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm14 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm12, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq 
{{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm5 -; SSE-NEXT: orps %xmm3, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm0, 
%xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm11 +; SSE-NEXT: orps %xmm4, %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd 
{{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: orps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm5, %xmm2 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: andps %xmm15, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: andnps %xmm1, %xmm15 +; SSE-NEXT: orps %xmm0, %xmm15 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: psrld $16, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: psrlq $16, %xmm0 +; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: psrld $16, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm14[0,1,0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -1814,59 +1798,59 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = 
xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm9, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps 
%xmm2, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movapd %xmm1, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm15, (%rax) -; SSE-NEXT: movaps %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm14, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, (%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) -; SSE-NEXT: addq $216, %rsp +; SSE-NEXT: movapd %xmm11, (%rax) +; SSE-NEXT: movapd %xmm0, 16(%rax) +; SSE-NEXT: addq $232, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $248, %rsp +; AVX1-ONLY-NEXT: subq $264, %rsp # imm = 0x108 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -1876,294 +1860,291 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,3,2,3] ; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm10 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[2],xmm2[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm2[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm13 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm6[6],xmm3[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3,4,5],xmm8[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm11[0],xmm0[1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm1[1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm4, %xmm9 -; AVX1-ONLY-NEXT: 
vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1],xmm10[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 
= xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm5, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5],xmm0[6],xmm13[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0],xmm6[1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm5[6],xmm15[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm7[1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1,2],xmm10[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm10, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3,4],xmm9[5,6,7] -; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5],xmm6[6],xmm4[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm13, %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5],xmm2[6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm11[1],xmm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm9[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm8[1],xmm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,2] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm14[1],zero -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1],mem[0],zero +; AVX1-ONLY-NEXT: 
vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm4[1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 
# 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $248, %rsp +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2349,16 +2330,16 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5] ; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] @@ -2368,8 +2349,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] @@ -2380,7 +2361,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] @@ -2388,8 +2369,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] @@ -2399,7 +2380,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] @@ -2410,7 +2391,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] @@ -2435,30 +2416,30 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm15[1],xmm6[2],xmm15[3],xmm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = 
ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2],xmm15[3],xmm9[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1,2,3,4,5,6,7],ymm9[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1,2,3,4,5,6,7],ymm5[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4] ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,3,7,0,0,3,7,0] ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6,7],ymm6[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5,6,7],ymm5[8,9,10,11,12],ymm14[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,4,7,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 @@ -2468,19 +2449,19 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1,2,3,4,5,6,7],ymm12[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1,2,3,4,5,6,7],ymm12[8],ymm5[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: 
vbroadcasti128 {{.*#+}} ymm8 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,0,3,7,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 @@ -2501,7 +2482,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -2523,30 +2504,30 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: 
vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3,4,5],xmm9[6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7,8,9,10],ymm12[11],ymm11[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] @@ -2554,18 +2535,18 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6,7],ymm9[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4],ymm13[5,6,7,8,9,10,11],ymm12[12],ymm13[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] @@ -2574,8 +2555,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,1,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm12[1,2,3,4,5,6,7],ymm9[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm12[1,2,3,4,5,6,7],ymm8[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] @@ -2610,10 +2591,10 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1,2,3,4,5,6,7],ymm14[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7] @@ -2636,16 +2617,16 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2,3,4,5,6,7],ymm11[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,2,3,0,1,14,15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] @@ -2655,18 +2636,18 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm5[1,2,3,4,5,6,7],ymm1[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -2807,17 +2788,17 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm11[1,2,3,4,5,6,7],ymm4[8],ymm11[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] @@ -2829,7 +2810,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rdx) @@ -2837,7 +2818,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %ymm9, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa %ymm10, (%r9) ; AVX512F-SLOW-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm11, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper @@ -2849,9 +2830,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] ; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] @@ -2861,8 +2842,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2900,14 +2881,14 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm8, 
%ymm14, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] @@ -2915,65 +2896,65 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} 
ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb 
{{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] @@ -2984,11 +2965,11 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -3000,9 +2981,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; 
AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] ; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] @@ -3012,8 +2993,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] @@ -3051,14 +3032,14 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] @@ -3066,65 +3047,65 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u> +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] ; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] 
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] @@ -3135,11 +3116,11 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, (%rsi) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -3232,136 +3213,137 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i16_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $600, %rsp # imm = 0x258 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm10 -; SSE-NEXT: movdqa 128(%rdi), %xmm11 -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm5 +; SSE-NEXT: movdqa 288(%rdi), %xmm6 +; SSE-NEXT: movdqa 112(%rdi), %xmm13 +; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps 144(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm9 +; SSE-NEXT: movdqa 176(%rdi), %xmm12 ; SSE-NEXT: movdqa 208(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 320(%rdi), %xmm5 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; 
SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 272(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 224(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps 384(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa 352(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3371,313 +3353,311 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm12 = 
xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} 
xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; 
SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = 
xmm7[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm8 +; SSE-NEXT: orps %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm5[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm6 -; SSE-NEXT: orps %xmm4, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm5[0],xmm14[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm14 +; SSE-NEXT: orps %xmm4, %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 
= xmm4[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,0,1] -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: orps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,0,1] +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = 
xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3685,60 +3665,89 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: andps %xmm15, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: psrlq $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -3746,27 +3755,28 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = 
xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] @@ -3778,271 +3788,241 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: psrld $16, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: psrlq $16, %xmm4 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: 
punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm1 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: psrlq $16, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = 
xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,6,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] +; 
SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] +; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = 
xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm12 = 
xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps 
%xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm5, (%rax) -; SSE-NEXT: movaps %xmm6, 48(%rax) -; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rax) +; SSE-NEXT: movaps %xmm6, (%rax) +; SSE-NEXT: movaps %xmm7, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, (%rax) -; SSE-NEXT: movapd %xmm1, 48(%rax) -; SSE-NEXT: movapd %xmm3, 32(%rax) -; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm3, 48(%rax) +; SSE-NEXT: movapd %xmm4, 32(%rax) +; SSE-NEXT: movapd %xmm5, 16(%rax) ; SSE-NEXT: addq $600, %rsp # imm = 0x258 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4054,16 +4034,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 @@ -4074,12 +4055,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm4[2],xmm10[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[2],xmm7[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 @@ -4092,12 +4072,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -4114,22 +4094,21 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm8[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm11[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] @@ -4145,662 +4124,673 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm15[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm10[6],xmm15[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: 
# xmm3 = xmm13[0],mem[1],xmm13[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm10[0,1,2,3,4,5],mem[6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm8[0],mem[1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5],xmm7[6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} 
xmm1 = xmm1[0,0,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload ; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm15[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7] +; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm8, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm3, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = 
xmm5[0,1,2,3,4,5],mem[6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm3[6],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm7[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm10, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 +; 
AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, (%rsp), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3,4,5],xmm2[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm6[0,1,2,3,4,5],mem[6],xmm6[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm0[1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm8[0],mem[1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = 
xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm12[0,1,2,3,4,5],mem[6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5],xmm11[6],xmm13[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm14[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0],xmm14[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5],xmm0[6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm9[6],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm11[1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm10[1],mem[0],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm15[1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 
16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm12[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = zero,xmm3[1],mem[0],zero -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, 
%xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm13[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 -; AVX1-ONLY-NEXT: vzeroupper -; AVX1-ONLY-NEXT: retq -; -; AVX2-SLOW-LABEL: load_i16_stride7_vf32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX1-ONLY-NEXT: vzeroupper +; AVX1-ONLY-NEXT: retq +; +; AVX2-SLOW-LABEL: load_i16_stride7_vf32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm4[2],ymm12[3,4,5],ymm4[6],ymm12[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2],ymm9[3],ymm13[4,5],ymm9[6],ymm13[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = 
[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7,8,9,10],ymm3[11],ymm0[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7,8,9,10,11],ymm4[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5],xmm14[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] @@ -4809,10 +4799,10 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] @@ -4821,21 +4811,20 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -4844,105 +4833,107 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm9[2],mem[3,4,5],ymm9[6],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7,8,9,10,11,12,13],ymm4[14],ymm8[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm10[2],ymm8[3,4,5],ymm10[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6],ymm4[7,8,9,10,11,12,13],ymm1[14],ymm4[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2],xmm1[3],xmm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd 
$187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6],ymm8[7,8,9,10,11,12,13],ymm5[14],ymm8[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] @@ -4950,92 +4941,94 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,5],xmm12[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6],ymm9[7,8],ymm7[9,10,11,12,13,14],ymm9[15] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6,7,8],ymm7[9],ymm3[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7],ymm5[8,9,10,11,12],ymm12[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3],xmm5[4],xmm3[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 
= ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -5059,17 +5052,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: movq 
{{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-SLOW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $584, %rsp # imm = 0x248 +; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 @@ -5099,11 +5092,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,1,u,4,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 @@ -5116,21 +5110,22 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,1,u,5,u,u,u> @@ -5139,19 +5134,20 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] @@ -5159,669 +5155,688 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,6,2,5,3,6,2,5] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,0,2] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,2,5,3,6,2,5] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,2] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm15 -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm12[2],ymm10[3,4,5],ymm12[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm15[4],xmm5[5],xmm15[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = 
[0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = 
[0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm8[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; 
AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2],xmm2[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm15 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm5[1,2,3,4,5,6,7],ymm15[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,1,4,2,5,1,4] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6,7],ymm15[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm9[1],xmm15[2],xmm9[3],xmm15[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3,4,5,6,7],ymm5[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm1[3],ymm11[4,5],ymm1[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] 
-; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7],ymm7[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = 
mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5],mem[6],ymm8[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd (%rsp), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,0,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; 
AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = 
xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: addq $584, %rsp # imm = 0x248 +; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: subq $552, %rsp # imm = 0x228 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: 
vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7,8,9,10],ymm3[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7,8,9,10,11],ymm4[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; 
AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6,7],ymm15[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
416(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm4[2],ymm9[3,4,5],ymm4[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6,7],ymm12[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm12 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = 
[18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4,5],ymm5[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6],ymm7[7,8,9,10,11,12,13],ymm5[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1],xmm8[2],xmm5[3],xmm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3,4,5,6,7],ymm8[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, 
%xmm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm6[1,2,3,4,5,6,7],ymm4[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3],xmm4[4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = 
ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7,8],ymm7[9,10,11,12,13,14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7],ymm10[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $237, 
(%rsp), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7,8],ymm7[9],ymm6[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6],ymm13[7,8],ymm10[9,10,11,12,13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6,7,8],ymm10[9],ymm5[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm8 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3],xmm6[4],xmm1[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2],ymm11[3],mem[4,5],ymm11[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) @@ -5842,11 +5857,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $552, %rsp # imm = 0x228 ; 
AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -5855,670 +5870,661 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: pushq %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm18[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,1,4,5,6,5] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3],xmm7[4],xmm2[5],xmm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6,7,8,9,10],ymm2[11],ymm8[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm8, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4],xmm2[5],xmm8[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, 
%ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7,8,9,10],ymm1[11],ymm2[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm13[1],xmm5[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm18[0,1,1,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm18[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm14, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,1,2,3] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm15, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm14[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm13[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm13, 
%xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm13, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm14, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6],ymm2[7,8,9,10,11,12,13],ymm8[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4,5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7,8],ymm3[9,10,11,12,13,14],ymm4[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpblendw 
{{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2],xmm7[3,4,5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm6[2],ymm9[3,4,5],ymm6[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm10[4],xmm3[5],xmm10[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2,3,4,5,6],ymm11[7,8],ymm2[9,10,11,12,13,14],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3,4,5],xmm10[6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm11[4],xmm2[5],xmm11[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4],xmm2[5],xmm14[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2,3],xmm9[4],xmm12[5],xmm9[6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3,4,5,6,7],ymm14[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7],ymm7[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm4[2],ymm12[3,4,5],ymm4[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6],ymm1[7,8,9,10,11,12,13],ymm7[14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm13[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3],ymm9[4,5,6,7,8,9,10],ymm12[11],ymm9[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7],ymm9[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm9, %ymm12, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3,4,5],xmm12[6],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4],ymm12[5,6,7,8,9,10,11],ymm14[12],ymm12[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = 
ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2],xmm14[3],xmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7],ymm12[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm12, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm7[1,2,3,4,5,6],ymm12[7,8],ymm7[9,10,11,12,13,14],ymm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm15[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5,6,7,8,9,10],ymm13[11],ymm10[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm13, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3,4,5],xmm10[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4],ymm10[5,6,7,8,9,10,11],ymm14[12],ymm10[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2],xmm14[3],xmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm9, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6],ymm10[7,8],ymm9[9,10,11,12,13,14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm14 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4,5,6,7,8],ymm14[9],ymm12[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm9[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,3,0,1] +; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6,7,8],ymm9[9],ymm4[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2],xmm8[3],xmm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5],ymm13[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1,2],ymm0[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd 
{{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,3,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm6, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm24 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm24 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; 
AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm6, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm25, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm26, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-SLOW-NEXT: popq %rax ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <1,u,u,u,5,8,12,15> +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,u,u,u,5,8,12,15> ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} 
ymm0 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm2, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm22, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm28[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm13[4],xmm10[5],xmm13[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm9, %ymm10, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm11, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3],xmm14[4],xmm13[5],xmm14[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2],xmm13[3,4,5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3],xmm15[4],xmm11[5],xmm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5],xmm14[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm10[1],xmm15[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm0[4],xmm11[5],xmm0[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] 
+; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5],xmm0[6],xmm8[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3,4,5,6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3,4,5],xmm7[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm2[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm24, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm26, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm22, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm19, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm27, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm0, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm28[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm12[4],xmm2[5],xmm12[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3,4,5,6,7],ymm14[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm14, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,3,u,0,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm10, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm2[2],ymm8[3,4,5],ymm2[6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3,4,5],xmm9[6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,11,2,11,12,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7],ymm13[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm15, %xmm10 +; 
AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm16, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,3,u,0,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7],ymm10[8,9,10,11,12],ymm13[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm12, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm12[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 
= ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3,4,5],xmm12[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5,6,7],ymm1[8,9,10,11,12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2],xmm10[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm9 +; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5],xmm6[6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm12, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = 
<2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm6[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3,4,5],xmm10[6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <3,u,u,u,6,10,13,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 
%xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7],ymm10[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,8,11,15,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm6, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm27, %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm30, %zmm14 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm20, %zmm4, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm25, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm18, %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm27, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm21, %zmm30, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm26, %zmm30, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm12, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -6529,661 +6535,669 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm12 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm11 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; 
AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm17[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm10, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = 
xmm3[0,1,2],xmm2[3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm5[1],xmm15[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7,8,9,10],ymm1[11],ymm2[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, 
%ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm8[2],ymm6[3,4,5],ymm8[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm18 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm11[1],xmm13[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm17[0,1,1,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: 
vinserti32x4 $2, %xmm1, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm5[3],ymm9[4,5],ymm5[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm18 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm19 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm0[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,1,2,1,4,5,6,5] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm13, %xmm2 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm15, %xmm2 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} 
xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm11, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm11, %xmm31 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm27 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7,8,9,10,11,12,13],ymm3[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6],ymm12[7,8],ymm3[9,10,11,12,13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6,7,8],ymm12[9],ymm0[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm1[2],ymm10[3,4,5],ymm1[6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} 
xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm24 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm24 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3,4,5],xmm10[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4,5],ymm11[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm5[2],ymm14[3,4,5],ymm5[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm12, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm17 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm18 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6],ymm0[7,8,9,10,11,12,13],ymm10[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%ymm26, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7,8,9,10],ymm12[11],ymm3[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm15[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6,7,8,9,10],ymm9[11],ymm12[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1],xmm12[2,3,4,5],xmm5[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3,4,5],xmm9[6],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7,8,9,10,11],ymm10[12],ymm9[13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm26 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] 
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm3[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4],ymm3[5,6,7,8,9,10,11],ymm10[12],ymm3[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6],ymm10[7,8],ymm9[9,10,11,12,13,14],ymm10[15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm6[0,1,2,3,4],ymm2[5,6,7],ymm6[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm9[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm25, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6,7,8],ymm8[9],ymm5[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm2[1,2,3,4,5,6],ymm10[7,8],ymm2[9,10,11,12,13,14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm15[2],ymm13[3,4,5],ymm15[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5,6,7],ymm10[8,9,10,11,12],ymm4[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm13[2,3],ymm4[4,5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6,7,8],ymm7[9],ymm3[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = 
xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2],xmm7[3],xmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,1] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1,2],ymm7[3,4,5,6,7],ymm1[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm25, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm25, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rdx) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%r8) 
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-SLOW-NEXT: popq %rax ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm31 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <1,u,u,u,5,8,12,15> +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <1,u,u,u,5,8,12,15> ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm31, %zmm6 +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm1, %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,u,u,u,4,8,11,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm19, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,4,7,11,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm27 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm27[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 
-; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm15[4],xmm12[5],xmm15[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm9, %ymm12, %ymm23 -; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm30, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,u,u,4,7,11,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm21, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm26[0,1,0,2] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm7, %ymm10, %ymm22 +; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3],xmm11[4],xmm15[5],xmm11[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm14[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3,4,5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3,4,5],xmm11[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm11, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm13[2],ymm2[3,4,5],ymm13[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3],xmm11[4],xmm10[5],xmm11[6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4,5,6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm9, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd 
{{.*#+}} ymm10 = ymm9[0,1],ymm3[2],ymm9[3,4,5],ymm3[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm10 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm26 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1,2,3],xmm5[4],xmm11[5],xmm5[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5],xmm6[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0],xmm0[1],xmm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: 
vinserti64x4 $1, %ymm13, %zmm11, %zmm25 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5],xmm11[6],xmm13[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm17 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm21, %zmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm14, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm18, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm18, %zmm17 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5],xmm8[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm19, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm12, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm21, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm10, %zmm0, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm15 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3],xmm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm10, %ymm11, %ymm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 
$1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm27[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm21, %zmm18, %zmm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm11, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm26[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6],xmm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [2,11,2,11,12,5,8,9] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm17, %zmm19 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm28 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm15, %xmm7 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm10, %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm9, %zmm9 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm31, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,4,7,11,14,u,u,u> ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,3,3,u,0,3,7,u> -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm31, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm9 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm5[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7],ymm7[8,9,10,11,12],ymm5[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm7[1],xmm14[2],xmm7[3],xmm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm12, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: 
vpermd %ymm26, %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7],ymm11[8,9,10,11,12],ymm13[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <3,u,u,u,6,10,13,u> +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3],xmm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm13, %zmm13 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,4,8,11,15,u,u,u> +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm26 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm1, %zmm20 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm25 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm25, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -7330,169 +7344,169 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i16_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1352, %rsp # imm = 0x548 -; SSE-NEXT: movdqa 640(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 624(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm12 -; SSE-NEXT: movdqa 128(%rdi), %xmm14 -; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm6 +; SSE-NEXT: movdqa 640(%rdi), %xmm9 +; SSE-NEXT: movdqa 624(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 128(%rdi), %xmm10 +; SSE-NEXT: movaps 160(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm1 
+; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2] ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,0,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,0,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 608(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 592(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 560(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa 560(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: movdqa 576(%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 528(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 512(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 496(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 480(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 448(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 464(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 464(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 416(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 384(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -7502,31 +7516,31 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 864(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 864(%rdi), 
%xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 848(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: movaps 832(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 832(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 784(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -7536,31 +7550,31 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -7570,31 +7584,31 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = 
xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 752(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 752(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 736(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 720(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 720(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 704(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: movdqa 672(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -7604,1079 +7618,1066 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm14[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: 
punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: 
movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm6 -; SSE-NEXT: orps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm9[0],xmm1[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: orps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = 
mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[0,1,0,1] -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: orps %xmm9, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm2[0],xmm9[1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: pshufd 
{{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; 
SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; 
SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm4 +; SSE-NEXT: orps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm6 +; SSE-NEXT: orps %xmm2, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = 
mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: psrld $16, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,1,0,1] +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3] +; 
SSE-NEXT: andps %xmm14, %xmm5 +; SSE-NEXT: orps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: andps %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = 
xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm13 +; 
SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = 
xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 
16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: psrld $16, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: psrld $16, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = 
xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: psrlq $16, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = 
xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -8684,10 +8685,30 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] @@ -8763,7 +8784,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] @@ -8773,164 +8794,145 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: 
punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 
16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 112(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%r8) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 48(%r8)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 32(%r8)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 16(%r8)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, (%r8)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 112(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 96(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 80(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 64(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 48(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 32(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 16(%r9)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, (%r9)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 96(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 112(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 48(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 64(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 80(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 96(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 112(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 48(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 64(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 80(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 96(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 112(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 64(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 80(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 48(%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%rcx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%rcx)
+; SSE-NEXT: movdqa %xmm14, 112(%r8)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 96(%r8)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 80(%r8)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 64(%r8)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 48(%r8)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%r8)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%r8)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%r8)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 112(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 96(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 80(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 64(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 48(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%r9)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%r9)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm10, 112(%rax)
-; SSE-NEXT: movaps %xmm11, 96(%rax)
-; SSE-NEXT: movaps %xmm14, 80(%rax)
-; SSE-NEXT: movaps %xmm15, 64(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 48(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 32(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, 16(%rax)
-; SSE-NEXT: movaps %xmm13, (%rax)
+; SSE-NEXT: movaps %xmm11, 112(%rax)
+; SSE-NEXT: movaps %xmm12, 96(%rax)
+; SSE-NEXT: movaps %xmm13, 80(%rax)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 64(%rax)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 48(%rax)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 32(%rax)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, 16(%rax)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps %xmm1, (%rax)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movapd %xmm0, 112(%rax)
-; SSE-NEXT: movapd %xmm1, 96(%rax)
-; SSE-NEXT: movapd %xmm2, 80(%rax)
-; SSE-NEXT: movapd %xmm5, 64(%rax)
-; SSE-NEXT: movapd %xmm6, 48(%rax)
-; SSE-NEXT: movapd %xmm7, 32(%rax)
-; SSE-NEXT: movapd %xmm8, 16(%rax)
-; SSE-NEXT: movapd %xmm9, (%rax)
+; SSE-NEXT: movapd %xmm4, 96(%rax)
+; SSE-NEXT: movapd %xmm5, 80(%rax)
+; SSE-NEXT: movapd %xmm6, 64(%rax)
+; SSE-NEXT: movapd %xmm7, 48(%rax)
+; SSE-NEXT: movapd %xmm8, 32(%rax)
+; SSE-NEXT: movapd %xmm9, 16(%rax)
+; SSE-NEXT: movapd %xmm10, (%rax)
; SSE-NEXT: addq $1352, %rsp # imm = 0x548
; SSE-NEXT: retq
;
@@ -8959,18 +8961,18 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,3,2,3]
-; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
+; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3
@@ -8981,10 +8983,10 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6
-; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7
-; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[2],xmm7[2],zero
-; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3
+; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6
+; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm6[2],zero
; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
@@ -8998,7 +9000,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2
@@ -9020,15 +9022,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm5
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3]
-; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm3
+; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm4
-; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3
-; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm4[2],xmm3[2],zero
+; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm5
+; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm15
+; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm15[2],zero
+; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9060,9 +9062,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm14
-; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3
+; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9072,25 +9074,25 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
+; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,3]
+; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3]
; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3
-; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4
-; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm4[2],zero
+; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9
+; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3
+; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm9[2],xmm3[2],zero
+; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm8
-; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm9
-; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4
+; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3
+; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
@@ -9120,11 +9122,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm3
-; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm2
+; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm13
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
+; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
@@ -9136,12 +9138,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm4
+; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm14
; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm4[2],xmm3[2],zero
-; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm10
-; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm14[2],xmm3[2],zero
+; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -9179,19 +9180,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
-; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm2
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
-; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
+; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7]
@@ -9201,6 +9204,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2
; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3
+; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
@@ -9208,14 +9212,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2
; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7]
+; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm0[6],mem[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
@@ -9224,12 +9228,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
+; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1
+; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm4 = xmm5[0],mem[1],xmm5[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
@@ -9244,8 +9248,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3
-; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4
+; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3
+; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
@@ -9257,8 +9261,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm14[6],mem[7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
@@ -9267,25 +9272,26 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
-; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm4 = xmm12[0],mem[1],xmm12[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm3 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
+; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1],xmm12[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
-; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm4
-; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3
-; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4
+; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3
+; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
@@ -9293,44 +9299,43 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm8[6],xmm12[7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5],xmm0[6],xmm10[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpslld $16, %xmm8, %xmm3
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm15[1],xmm11[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm5
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3
-; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
+; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
@@ -9351,63 +9356,64 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
-; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2]
-; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7]
+; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2]
+; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6,7]
; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm4[7]
; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1]
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4
-; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4
+; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3]
-; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm3[1],xmm5[1]
-; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm3 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
+; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm5[1]
+; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4
+; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5
; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3
; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2
-; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpsllq $16, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3
+; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm3
+; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3]
-; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
+; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm4[1],xmm5[1]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
@@ -9418,146 +9424,230 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6
-; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
-; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2
-; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpsllq $16, %xmm9, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,3,2,3]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3],xmm12[4,5,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[2,2,3,3]
-; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm6[1]
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5
+; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6
+; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3
+; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5
+; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3
+; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vpsllq $16, %xmm1, %xmm3
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3],xmm10[4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
+; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3]
+; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,2,2]
-; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm6[6,7]
-; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1]
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4
-; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6
-; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm10
-; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
-; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2
-; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4
-; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm0
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,2,2,2]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5],xmm7[6,7]
+; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,1]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm15[7]
+; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1]
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5
+; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7
+; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3
+; AVX1-ONLY-NEXT: vandps %ymm5, %ymm9, %ymm5
+; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm0
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpsllq $16, %xmm12, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,3,2,3]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm4[6,7]
-; AVX1-ONLY-NEXT: vpshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3]
-; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm2[1],xmm6[1]
+; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm3
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,3,2,3]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,1,2,3]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5],xmm5[6,7]
+; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
+; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3]
+; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,5],xmm15[6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,1,0,1]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7]
+; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1]
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
+; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7
+; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14
+; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm9, %ymm5
+; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7
+; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5
+; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,3,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
+; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,1,2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
+; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3]
+; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm12
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12
+; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2
+; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm11, %ymm12
+; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm12
+; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm9, %ymm12
+; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2
+; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2
+; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,2,2]
-; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3,4,5],xmm7[6,7]
-; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,1]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm15[7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm12 = xmm11[0,1,2,3,4,5],mem[6],xmm11[7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1,2],xmm12[3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,1,2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1]
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7
-; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6
-; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7
-; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4
-; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm6
-; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm7
-; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm0
+; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
+; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
+; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm15
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4
+; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm15, %ymm14
+; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm4
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12
+; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4
+; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm4
+; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm12 = xmm13[0,1,2,3,4,5],mem[6],xmm13[7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm4[1,2],xmm12[3,4,5,6,7]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
+; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
+; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3
+; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14
+; AVX1-ONLY-NEXT: vorps %ymm3, %ymm14, %ymm3
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
+; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
+; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12
+; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3
+; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm0
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm0[6],mem[7]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,1,2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm6[6,7]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,0,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,7]
-; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm15 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3]
-; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm13
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13
-; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3
-; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm10, %ymm13
-; AVX1-ONLY-NEXT: vorps %ymm3, %ymm13, %ymm3
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4
-; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3
-; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
@@ -9565,265 +9655,176 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm4 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1,2],xmm4[3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm0[6],mem[7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm3[1,2],xmm12[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm13[6,7]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm10[0],mem[1],xmm10[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm13[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,1,2,3,4,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
+; AVX1-ONLY-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7]
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
-; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm15
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13
-; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5
-; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13
-; AVX1-ONLY-NEXT: vorps %ymm5, %ymm13, %ymm5
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
-; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4
-; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5
-; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
-; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm12[6],mem[7]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,1,2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm13[6,7]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
-; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = mem[0],xmm1[1],mem[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7]
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7]
-; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
-; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13
-; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm15
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13
-; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2
-; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm1, %ymm13
-; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9
-; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
-; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5
-; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2
-; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm1
-; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-;
AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm1[6],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm1[0],mem[1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = 
xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -9854,12 +9855,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte 
Folded Reload @@ -9867,268 +9868,229 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm11[1],xmm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, 
%xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 
16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,5],xmm7[6],xmm9[7] +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm13[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: 
vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm1[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[0],mem[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5],xmm2[6],xmm8[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm13[1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = zero,xmm14[1],mem[0],zero -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm9[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm3[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = 
xmm7[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[3,3,3,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = zero,xmm0[1],mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = zero,xmm15[1],mem[0],zero +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm8[1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload @@ -10138,14 +10100,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm0[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -10167,25 +10129,65 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = zero,xmm0[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm3, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 
(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10221,24 +10223,24 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) ; AVX1-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-SLOW-NEXT: subq $1448, %rsp # imm = 0x5A8 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 @@ -10247,397 +10249,394 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm9[2],ymm14[3,4,5],ymm9[6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm12[2],ymm8[3,4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm0[2],ymm8[3,4,5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4,5],ymm3[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = 
ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm11 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm10[2,3],ymm0[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm11[2,3],ymm0[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm12[1],ymm8[2,3,4],ymm12[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12 -; 
AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd $31, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 
$1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-SLOW-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm8[2],ymm12[3,4,5],ymm8[6],ymm12[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4],xmm1[5],xmm13[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} 
xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm1 = ymm9[0,1,1,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] @@ -10645,2153 +10644,2157 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2,3,4,5,6,7],ymm6[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4,5],ymm6[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = 
xmm5[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm5[2],ymm2[3,4,5],ymm5[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2,3,4,5,6,7],ymm6[8],ymm3[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm15[2],ymm7[3,4,5],ymm15[6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7,8,9,10,11,12,13],ymm3[14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb 
%ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7,8,9,10,11,12,13],ymm4[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $221, (%rsp), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5],xmm2[6],xmm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm13 +; 
AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm12[2,3],mem[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7],ymm2[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm9[2,3],ymm14[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm15[2,3],ymm9[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3],xmm4[4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2,3,4,5],xmm10[6],xmm14[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2],ymm5[3],mem[4,5],ymm5[6],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7,8],ymm5[9,10,11,12,13,14],ymm6[15] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6,7,8],ymm4[9],ymm0[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[3],xmm5[4],xmm0[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[0],xmm0[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7],ymm5[8,9,10,11,12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6,7,8],ymm6[9],ymm3[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7,8],ymm6[9,10,11,12,13,14],ymm7[15] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4],xmm6[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3],xmm6[4],xmm3[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte 
Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3,4,5,6,7],ymm6[8],ymm7[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5,6,7,8],ymm10[9],ymm7[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4,5,6,7,8],ymm7[9],ymm1[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2],ymm6[3],mem[4,5],ymm6[6],mem[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3],xmm6[4],xmm2[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm14[1],xmm10[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6,7,8],ymm6[9],ymm2[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps 
%ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: 
vmovaps %ymm3, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%rax) -; AVX2-SLOW-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-SLOW-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride7_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 
%ymm4, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm15 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 +; 
AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,1,u,4,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
xmm3 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm10[0,1],mem[2],ymm10[3,4],mem[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,1,u,5,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,5,1,u,4,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] 
+; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vmovdqu 
(%rsp), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, 
%ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,2,5,3,6,2,5] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,2,5,3,6,2,5] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, 
%ymm0 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5],ymm15[6],ymm14[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = 
ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5],xmm14[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 
= xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm13[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm9[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm12 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; 
AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm7[2],ymm13[3,4],ymm7[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; 
AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm8[2],mem[3,4,5],ymm8[6],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; 
AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4,5],mem[6],ymm4[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; 
AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
-; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
-; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm0 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
-; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm11
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,4,7,3,6,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm5
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29>
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,3,3,0,3,7,7]
-; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7]
-; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4
-; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm15[2,3],mem[4,5],ymm15[6,7]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4
-; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm7
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7]
+; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4]
+; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7]
+; 
AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm10[0,1],mem[2],ymm10[3,4],mem[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2],ymm8[3],mem[4,5],ymm8[6],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6,7],ymm5[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: 
vpermd %ymm8, %ymm6, %ymm2
-; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
+; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
+; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2
-; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm12 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13
+; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7]
+; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29>
+; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm12 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7]
+; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm13
+; 
AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2],ymm15[3],mem[4,5],ymm15[6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,4,0,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm10[3],ymm13[4,5],ymm10[6],ymm13[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7],ymm3[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2,3,4,5,6,7],ymm5[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7],ymm5[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb 
%xmm11, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] +; AVX2-FAST-NEXT: 
vpermd %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm15, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-FAST-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1512, %rsp # imm = 0x5E8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: subq $1448, %rsp # imm = 0x5A8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4,5],ymm14[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4,5],ymm13[6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4,5],ymm0[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 
32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, (%rsp), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 
= [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm13[0,1,2],mem[3],ymm13[4,5],mem[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] 
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
640(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm3 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm7[2],ymm11[3,4,5],ymm7[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5],xmm13[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm1 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw 
{{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm9[2],ymm3[3,4,5],ymm9[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = 
[16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm5 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm7[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 
32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm6[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm12[0,1],ymm15[2],ymm12[3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7,8,9,10,11,12,13],ymm4[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm3[1],xmm13[2],xmm3[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $68, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6],ymm7[7,8,9,10,11,12,13],ymm5[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2],xmm5[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4,5],mem[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6],ymm7[7,8,9,10,11,12,13],ymm5[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2],xmm5[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4,5,6,7],ymm7[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm13[2],ymm8[3,4,5],ymm13[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7,8,9,10,11,12,13],ymm4[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} 
ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6],ymm4[7,8],ymm1[9,10,11,12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3],xmm4[4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],xmm5[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0],ymm1[1,2,3,4,5,6],ymm4[7,8],ymm1[9,10,11,12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: 
vpblendw {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7],ymm12[8,9,10,11,12],ymm13[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm12[1,2,3,4,5,6,7],ymm7[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3],xmm5[4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,5],xmm7[6],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2],ymm5[3],mem[4,5],ymm5[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7,8],ymm5[9,10,11,12,13,14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, 
%ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3],xmm3[4],xmm0[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4,5,6,7,8],ymm7[9],ymm2[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} 
ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2],xmm11[3],xmm8[4],xmm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5],xmm14[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7],ymm8[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6],ymm13[7,8],ymm8[9,10,11,12,13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: 
# ymm8 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6,7,8],ymm9[9],ymm8[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6,7,8],ymm4[9],ymm1[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7,8],ymm8[9],ymm1[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm8[1,2,3,4,5,6,7],ymm1[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1],xmm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6,7,8],ymm7[9],ymm4[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1,2],mem[3],ymm15[4,5],mem[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 
64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7,8],ymm6[9],ymm5[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1512, %rsp # imm = 0x5E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1800, %rsp # imm = 0x708 +; AVX512F-ONLY-SLOW-NEXT: subq $1864, %rsp # imm = 0x748 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
480(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> @@ -12799,8 +12802,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 @@ -12809,114 +12812,111 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm2[2],ymm4[3,4,5],ymm2[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm22[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
128(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm19[0,1,0,2] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6,7,8,9,10],ymm0[11],ymm5[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm7, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm2[2],ymm13[3,4,5],ymm2[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = 
[0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm25[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm15[1],xmm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6,7,8,9,10],ymm8[11],ymm9[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm2[2],ymm11[3,4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm27[0,1,1,3,4,5,5,7] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm15, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] @@ -12925,1808 +12925,1813 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm17 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm22[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,1,2] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm20[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,1,1,2] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm22[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,1,1,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,1,4,5,6,5] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm16, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm23, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm20[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5],xmm1[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6],xmm7[7] ; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm18, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3],xmm7[4],xmm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5],xmm5[6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm28, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3],xmm5[4],xmm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11 ; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm21, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm27[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm20 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7,8,9,10,11,12,13],ymm4[14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 -; 
AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6],ymm8[7,8],ymm1[9,10,11,12,13,14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm27 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm27[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6],ymm8[7,8],ymm1[9,10,11,12,13,14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm13[2,3],ymm2[4,5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3,4,5,6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 
+; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3,4,5,6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4,5,6,7,8],ymm0[9],ymm8[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm22, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm27 +; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm24, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm25 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm27[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm22, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3,4,5,6,7,8],ymm0[9],ymm6[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, 
%ymm0, %ymm24, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
%ymm10, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4,5],ymm4[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm8, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4,5,6,7],ymm11[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm8[3,4,5,6,7],ymm1[8,9,10],ymm8[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3,4,5,6,7],ymm6[8,9,10],ymm11[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4,5],ymm12[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6],ymm8[7,8,9,10,11,12,13],ymm0[14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6],ymm11[7,8,9,10,11,12,13],ymm0[14],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3],ymm9[4,5,6,7,8,9,10],ymm14[11],ymm9[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm9, %ymm14, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm11, %ymm13, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4,5],ymm0[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7,8,9,10,11,12,13],ymm0[14],ymm9[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3,4,5],xmm0[6],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm26[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7,8,9,10],ymm15[11],ymm14[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = 
ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7,8,9,10,11,12,13],ymm0[14],ymm6[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm22[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5,6,7,8,9,10],ymm11[11],ymm6[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1],xmm6[2,3,4,5],xmm11[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm6, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm24 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2],xmm6[3],xmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm6, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm2[3],ymm7[4,5],ymm2[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4],ymm0[5,6,7,8,9,10,11],ymm13[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw 
{{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2],xmm15[3],xmm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm12, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3,4,5,6],ymm12[7,8],ymm0[9,10,11,12,13,14],ymm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3],ymm0[4],ymm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6,7,8],ymm12[9],ymm0[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm11[1],xmm15[2],xmm11[3],xmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4,5],mem[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7,8],ymm15[9],ymm14[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm14, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4,5,6,7,8],ymm15[9],ymm8[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm27, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2],xmm10[3],xmm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd 
{{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm1[0,1],mem[2],ymm1[3,4,5],mem[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm13, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm20, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm4[0,1],mem[2],ymm4[3,4,5],mem[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm23 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload 
+; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm12 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm18 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm16 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm21, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm26, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm30 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm25 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm12, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm23, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1800, %rsp # imm = 0x708 +; AVX512F-ONLY-SLOW-NEXT: addq $1864, %rsp # imm = 0x748 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1736, %rsp # imm = 0x6C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: subq $1768, %rsp # imm = 0x6E8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: 
vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm6, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm6, %ymm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 672(%rdi), %xmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm13 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2,3,4,5],xmm6[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm11[2],ymm3[3,4,5],ymm11[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm15, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3,4,5],xmm7[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm8, %ymm4 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2,3,4,5],xmm4[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm30[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm2[1],xmm4[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm11, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm11, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm14, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm11, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm3 ; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2],xmm3[3],xmm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm20, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm11, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm9, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3,4,5],xmm9[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm20, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1],xmm15[2],xmm8[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, 
%zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm8, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm26[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm19, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm30[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm20, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm16 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm28, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 
%xmm1, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,3,3,0,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm13, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4],xmm3[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm25, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm17, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm18, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm17, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm2 +; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm31, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, 
%ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4],xmm5[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm14, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3,4,5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3],xmm12[4],xmm3[5],xmm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4,5,6],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = 
[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3,4,5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm11, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,u,u,u,4,8,11,15> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3,4,5,6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm15, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4],xmm7[5],xmm0[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm15, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4,5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm12[2],ymm5[3,4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,u,u,u,5,8,12,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; 
AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4,5],ymm2[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm3[4],xmm12[5],xmm3[6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4,5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,u,u,u,5,8,12,15> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm13[2],ymm2[3,4,5],ymm13[6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,11,2,11,12,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm5, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2,3,4,5],xmm2[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm17, %zmm19, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm18, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3,4,5],xmm1[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2,3,4,5],xmm14[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} 
xmm5 = xmm5[0],xmm11[1],xmm5[2,3,4,5],xmm11[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm18, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm5, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm6[3],ymm13[4,5],ymm6[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, 
%xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm18, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm17, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %ymm22, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3,4,5],xmm12[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm14, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm17 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2],xmm10[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1],xmm11[2],xmm4[3],xmm11[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm26, %zmm10, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,8,11,15,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = 
[4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512F-ONLY-FAST-NEXT: 
vpblendd {{.*#+}} ymm8 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm13 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm17 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm10, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm10, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7],ymm4[8,9,10],ymm5[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm9[1,2],ymm7[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm9 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm10 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm16, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX512F-ONLY-FAST-NEXT: addq $1768, %rsp # imm = 0x6E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX512DQ-SLOW-NEXT: subq $1560, %rsp # imm = 0x618 ; AVX512DQ-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; 
AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vporq %ymm3, %ymm2, %ymm27 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm4 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm16 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm16[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm18[0,1,0,2] ; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; 
AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7,8,9,10],ymm0[11],ymm2[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3,4,5],xmm3[6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[0,1,1,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm15[1],xmm10[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm1[1],xmm15[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm11 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7,8,9,10],ymm6[11],ymm7[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm22 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm22[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12 +; 
AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2],ymm12[3,4,5],ymm14[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm20[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6,7,8,9,10,11],ymm6[12],ymm4[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm30 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; 
AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm16[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,2] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm10, %xmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm20 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm31 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm9 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm20[0,1,1,2] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vporq %ymm8, %ymm7, %ymm30 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm4[1],ymm11[2,3,4],ymm4[5],ymm11[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm13 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm16[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm17, %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm18[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: 
vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm17, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm22[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm11[1],ymm9[2,3,4],ymm11[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4],xmm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = 
[8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5],xmm0[6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],xmm5[6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm22, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3],xmm2[4],xmm7[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm20, %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm23, %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3],xmm7[4],xmm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5],xmm4[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm24, %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm15 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm6[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7],ymm7[8,9,10,11,12],ymm6[13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} 
xmm6 = xmm6[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm14[2,3],ymm3[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm16 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm7[1],xmm4[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 {%k1} # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm18 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 @@ -14734,1249 +14739,1250 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; 
AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6],ymm3[7,8,9,10,11,12,13],ymm12[14],ymm3[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm1[4],xmm12[5],xmm1[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-SLOW-NEXT: 
vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3,4,5,6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm30 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm30[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6],ymm9[7,8,9,10,11,12,13],ymm13[14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4,5,6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm1[4],xmm12[5],xmm1[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4,5,6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4,5,6],ymm15[7,8],ymm13[9,10,11,12,13,14],ymm15[15] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3,4,5,6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm13, %xmm29 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm29[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm11[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4,5,6,7,8],ymm1[9],ymm12[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm5[2],ymm14[3,4,5],ymm5[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1,2,3,4,5,6],ymm13[7,8],ymm11[9,10,11,12,13,14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3,4,5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4,5,6,7,8],ymm0[9],ymm9[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm4[2],ymm1[3,4,5],ymm4[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4],xmm9[5],xmm13[6],xmm9[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm1, %ymm17, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm30 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm31 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm29 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm29[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3,4,5,6,7,8],ymm1[9],ymm7[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5],xmm11[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm1, %ymm17, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm2[3],ymm10[4,5],ymm2[6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} 
xmm11 = xmm11[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5],xmm7[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm7 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm30 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3,4,5],xmm1[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm14[2],ymm9[3,4,5],ymm14[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5],xmm11[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm11, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5],xmm7[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm27[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm7 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm28 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3,4,5],xmm1[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm10[2],ymm2[3,4,5],ymm10[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm25 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: 
vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm16 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; 
AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6],ymm11[7,8,9,10,11,12,13],ymm1[14],ymm11[15] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3,4,5],xmm11[6],xmm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1],ymm12[2,3],ymm0[4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm22[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm24 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm13 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm11, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm11, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7,8,9,10,11,12,13],ymm11[14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm28, %zmm24 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm30 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm30 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; 
AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6],ymm13[7,8,9,10,11,12,13],ymm0[14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1],ymm10[2],mem[3,4],ymm10[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm20, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm21[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm29 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7,8,9,10],ymm13[11],ymm12[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5,6,7,8,9,10,11],ymm11[12],ymm7[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, 
%xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm13 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4],ymm7[5,6,7,8,9,10,11],ymm12[12],ymm7[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0,1,2],ymm12[3,4,5,6,7],ymm7[8,9,10],ymm12[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6],ymm12[7,8],ymm10[9,10,11,12,13,14],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6,7,8],ymm8[9],ymm7[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} 
xmm10 = xmm10[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm9[1],xmm14[2,3,4,5],xmm9[6],xmm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm13, %ymm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4,5],ymm14[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7,8,9,10,11,12,13],ymm0[14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm18 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5,6,7,8,9,10],ymm11[11],ymm9[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,1] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm20 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm13 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm12 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm9 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] -; 
AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,3,1] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,3,1] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm7[1],ymm12[2,3],ymm7[4],ymm12[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm14[3],ymm6[4,5],ymm14[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 
= xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $1560, %rsp # imm = 0x618 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm22 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm6 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm6, %ymm16 +; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm6, %ymm29 ; AVX512DQ-FAST-NEXT: vmovdqa 
672(%rdi), %xmm7 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 ; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm23 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm23[0,1,0,2] +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,0,2] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm13[2],ymm6[3,4,5],ymm13[6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm13 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 
160(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3,4,5],ymm5[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm12[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm8[2],ymm1[3,4,5],ymm8[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm3[1],xmm13[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm31 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm27 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; 
AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm7, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4],xmm2[5],xmm7[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm26 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[0,1,0,2] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm14[1],xmm7[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm24 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] -; AVX512DQ-FAST-NEXT: 
vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm21 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm21[0,1,0,2] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, (%rsp) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm3[1],xmm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm16 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm10, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, 
%xmm4, %xmm12 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm10, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm14 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3,4,5],xmm3[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm10, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm30 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm17, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm10, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm22 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3,4,5],xmm8[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; 
AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm14 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm23[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm14 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2],xmm4[3],xmm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm7 -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm19[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm20 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1],xmm12[2],xmm5[3],xmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3,4,5],xmm7[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm26[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm31 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; 
AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm22, %xmm7 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm16 {%k1} # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm21[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm23 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm20, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,3,3,0,3,7,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7],ymm5[8,9,10,11,12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 
{{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm31, %xmm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm2, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm22, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,3,3,0,3,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm13, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7],ymm7[8,9,10,11,12],ymm2[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm11, %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4],xmm1[5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm11, %xmm7 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3],xmm9[4],xmm2[5],xmm9[6,7] ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4,5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 {%k1} # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm30, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm0 -; 
AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,4,7,11,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5,6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm26, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 {%k1} # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm0 -; 
AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3],xmm15[4],xmm5[5],xmm15[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2],xmm15[3,4,5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3],xmm6[4],xmm14[5],xmm6[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,u,u,4,8,11,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4,5,6],xmm15[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 +; 
AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3],xmm15[4],xmm8[5],xmm15[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5,6],xmm9[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,u,u,u,4,8,11,15> +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3],xmm6[4],xmm15[5],xmm6[6],xmm15[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3,4,5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm13[2],ymm5[3,4,5],ymm13[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = 
<1,u,u,u,5,8,12,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3,4,5,6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4,5],ymm13[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,u,u,u,5,8,12,15> +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm26, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm27 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, 
%ymm6 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm27 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm26, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm4, %zmm1 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 
= ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3,4,5],xmm1[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm10 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm21 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 
= xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6],xmm0[7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm19 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2,3,4,5],xmm8[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm31 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm25 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512DQ-FAST-NEXT: 
vextracti128 $1, %ymm0, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm22, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm31 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm31 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm21 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = 
<2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm14, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm29 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm30 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm27, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, 
%xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm27 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm27 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm25 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm22, %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3,4,5],xmm15[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2],xmm15[3],xmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm22, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-FAST-NEXT: 
vpblendd {{.*#+}} ymm12 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3,4,5],xmm15[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1],xmm8[2],xmm15[3],xmm8[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm12, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm19, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm22, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm8[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3,4,5],xmm8[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm22, %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3,4],ymm13[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm4 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1,2],ymm5[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1,2],ymm11[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2],xmm4[3],xmm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm12 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rsi) -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-FAST-NEXT: addq $1288, %rsp # imm = 0x508 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index e88f9e1ebee09..91eba6c880376 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -535,18 +535,18 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm3, %xmm6 +; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm0, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 @@ -563,99 +563,100 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] ; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpermi2d %xmm1, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm2[1],xmm10[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3] 
+; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512F-SLOW-NEXT: vpermt2d %xmm2, %xmm15, %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpermt2d %xmm3, %xmm15, %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %xmm7, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa %xmm8, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %xmm9, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa %xmm10, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rax) ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride8_vf8: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} 
xmm14 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm15 -; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm14, %xmm15 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm15 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm15 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm15, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm3, %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpermi2d %xmm5, %xmm6, %xmm14 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm6 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm2 +; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512F-FAST-NEXT: vpermi2d %xmm4, %xmm5, %xmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = 
xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm7, %xmm9 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-FAST-NEXT: vpermi2d %xmm4, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm10, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %xmm2, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %xmm2, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %xmm11, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %xmm3, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa %xmm8, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rax) -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rax) +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa %xmm4, (%rax) ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride8_vf8: @@ -715,74 +716,75 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i16_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa 112(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm5 ; SSE-NEXT: movdqa 240(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm12 ; SSE-NEXT: movdqa 144(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: movdqa 128(%rdi), %xmm10 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = 
xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm12 +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movdqa %xmm2, %xmm15 @@ -792,113 +794,114 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[2,3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhwd (%rsp), 
%xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = 
xmm9[0,1],xmm10[2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm11[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,2,2] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm11[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps %xmm8, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r8) ; SSE-NEXT: movaps %xmm1, (%r9) -; SSE-NEXT: movapd %xmm2, 16(%r9) +; SSE-NEXT: movapd %xmm12, 16(%r9) ; SSE-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, (%rax) -; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: movaps %xmm4, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm6, (%rax) -; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: movapd %xmm9, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps %xmm14, (%rax) @@ -930,8 +933,8 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] @@ -939,25 +942,25 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4,5,6,7] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] @@ -965,114 +968,114 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,0,0] +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,5],xmm11[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm6[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,2,2,2] -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: addq $152, %rsp @@ -1081,98 +1084,97 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $232, %rsp +; AVX2-ONLY-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 
224(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm11, %xmm0 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm0 ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm8, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm9, %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm3[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm15 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5,6],ymm12[7] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = 
xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm5[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm2[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: 
vpshuflw {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -1200,47 +1202,48 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,1,3] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vpermq $212, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm1[1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] @@ -1249,12 +1252,12 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = 
ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) @@ -1266,10 +1269,10 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $232, %rsp +; AVX2-ONLY-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1287,49 +1290,49 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm27 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm30 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm7, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm16 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm16[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = 
ymm18[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm20 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm20[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm21 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm21[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm23 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm16 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm28 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1339,26 +1342,26 @@ define void @load_i16_stride8_vf16(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm23[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm14, %xmm17, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3] +; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 @@ -1368,76 +1371,76 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm2 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-SLOW-NEXT: vpermi2d %xmm1, %xmm2, %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = 
xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm16[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm18[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm20[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm21[0,1,1,3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm3 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = 
ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa %ymm10, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm6, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -1445,164 +1448,164 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm27 ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm7, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm13 +; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm7, %xmm13 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; 
AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm17 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm17[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm18[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm16 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm20 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm20[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm16 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm16[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm23[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] +; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm19 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm13, %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = 
ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm23[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm19 = [3,7,3,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm19, %xmm12 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq 
{{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm4, %xmm7 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[0,1,2,0,4,5,6,4] +; 
AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm18[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm20[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm5, %xmm13 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm5, %xmm15 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd 
{{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm11 +; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm4, %xmm12 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm19, %xmm5 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm18, %xmm5 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, (%rdx) ; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %ymm4, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%rax) ; AVX512F-FAST-NEXT: movq 
{{[0-9]+}}(%rsp), %rax @@ -1702,11 +1705,11 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i16_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $728, %rsp # imm = 0x2D8 +; SSE-NEXT: subq $696, %rsp # imm = 0x2B8 ; SSE-NEXT: movdqa 496(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 480(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm3 @@ -1720,7 +1723,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] @@ -1743,372 +1746,372 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa 448(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa 384(%rdi), %xmm10 +; SSE-NEXT: movdqa 
%xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: movdqa 304(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movapd %xmm9, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: punpckhwd (%rsp), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, 
%xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; SSE-NEXT: movaps 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm14 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = 
xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movapd %xmm6, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,3] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = 
xmm14[0,1],xmm1[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpckhps (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -2142,29 +2145,28 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, 32(%rax) +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rax) -; SSE-NEXT: movaps %xmm9, (%rax) +; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm12, 48(%rax) -; 
SSE-NEXT: movapd %xmm11, 32(%rax) -; SSE-NEXT: movapd %xmm10, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movapd %xmm13, 48(%rax) +; SSE-NEXT: movapd %xmm12, 32(%rax) +; SSE-NEXT: movapd %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm14, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 48(%rax) ; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps %xmm14, 16(%rax) +; SSE-NEXT: movaps %xmm15, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $728, %rsp # imm = 0x2D8 +; SSE-NEXT: addq $696, %rsp # imm = 0x2B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $856, %rsp # imm = 0x358 +; AVX1-ONLY-NEXT: subq $872, %rsp # imm = 0x368 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 @@ -2174,8 +2176,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 @@ -2192,31 +2194,31 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -2226,9 +2228,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 @@ -2247,8 +2249,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -2266,90 +2269,86 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm9[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = 
xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; 
AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2360,8 +2359,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] @@ -2370,91 +2369,91 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 
= xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -2463,104 +2462,106 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = 
xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm7[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = 
mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] @@ -2591,19 +2592,19 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $856, %rsp # imm = 0x358 +; AVX1-ONLY-NEXT: addq $872, %rsp # imm = 0x368 ; AVX1-ONLY-NEXT: vzeroupper ; 
AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $984, %rsp # imm = 0x3D8 +; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 @@ -2618,15 +2619,14 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm9 ; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 @@ -2635,16 +2635,18 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2652,12 +2654,12 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2685,145 +2687,148 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm10[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: 
vmovdqa 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm2[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, %xmm14 +; AVX2-ONLY-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm13, %xmm8 -; 
AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm12 +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; 
AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2839,62 +2844,60 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm15 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw 
{{.*#+}} ymm7 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm7 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm9 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; AVX2-ONLY-NEXT: 
vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm11, %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2902,244 +2905,239 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm12[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; 
AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %xmm15, %xmm13 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm7 -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} 
ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; 
AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: addq $984, %rsp # imm = 0x3D8 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: subq $616, %rsp # imm = 0x268 ; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm1, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm26 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vpermt2d %xmm3, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm19[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-SLOW-NEXT: movb $-64, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; 
AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 @@ -3150,118 +3148,122 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm26 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm26[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm25[0,1,0,2] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm27 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm27[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm28 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm4[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512F-SLOW-NEXT: vpshufhw 
{{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw 
{{.*#+}} ymm13 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm17[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm9, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm5[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 
= ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm6, %xmm8, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %xmm8, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -3272,178 +3274,182 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm2, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm9 -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm20 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm20[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm19 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm21 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: 
vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm29 = mem[0,1,1,3] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm23 = mem[0,1,1,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm9 {%k1} +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vpermi2d %xmm3, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vpermi2d %xmm2, %xmm3, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm7 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm26[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm25[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm24[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm14[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm27[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm28[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm26[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm3[1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; 
AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm24 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm25 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm16[0],xmm5[1],xmm16[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm28 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm18[2],xmm1[3],xmm18[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm22[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm24[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = 
ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm9 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; 
AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm16[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vpermt2d %xmm18, %xmm12, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm12, %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm16[2],xmm5[3],xmm16[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: 
vmovaps %zmm1, (%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%r8) +; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %xmm8, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: addq $616, %rsp # imm = 0x268 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -3454,43 +3460,42 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa 272(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm15 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm11 ; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 ; 
AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm27[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3500,25 +3505,26 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: movb $-64, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm11, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -3528,56 +3534,57 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; 
AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm24[0,1,0,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm25 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm25[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm26 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm26[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm13 -; AVX512F-FAST-NEXT: 
vpunpckldq {{.*#+}} xmm1 = xmm18[0],xmm5[0],xmm18[1],xmm5[1] +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm11 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm13, %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm11, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] @@ -3585,245 +3592,246 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm18[2],xmm5[2],xmm18[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: 
vpermt2d %xmm5, %xmm0, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm20[2],xmm21[3],xmm20[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpermt2d %xmm22, %xmm0, %xmm2 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm11, %xmm0, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm16[2],xmm7[3],xmm16[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm31[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm29 = [3,7,3,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm29, %xmm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm7[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm25, %xmm0, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm27 = [3,7,3,7] +; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm29, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm9[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm27, 
%xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm1 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] -; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpermq 
$212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm27 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm27[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm23 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm23[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm22 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm21 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31 +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm29 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm26 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm30 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm25 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1} ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = 
xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512F-FAST-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
+; AVX512F-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22
 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
 ; AVX512F-FAST-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm5, %xmm0
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm17
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
-; AVX512F-FAST-NEXT: # xmm9 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm16
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm10[0,1],xmm0[2,3]
+; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3]
+; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512F-FAST-NEXT: # ymm20 = mem[0,1,1,3]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4]
+; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
 ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
 ; AVX512F-FAST-NEXT: # ymm19 = mem[0,1,1,3]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm19[0,1,2,0,4,5,6,4]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,1,3]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm24[0,1,2,0,4,5,6,4]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,1,3]
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm26[0,1,1,3]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5],ymm7[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm28
-; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0
-; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [1,5,1,5]
-; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm8, %xmm0
-; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm20[0],xmm6[1],xmm20[1]
+; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512F-FAST-NEXT: # ymm21 = mem[0,1,1,3]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
+; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm0
+; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,5,1,5]
+; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
+; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512F-FAST-NEXT: vmovdqa %xmm8, %xmm4
-; AVX512F-FAST-NEXT: vpermi2d %xmm16, %xmm9, %xmm4
-; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm31[0],xmm17[1],xmm31[1]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm1[2,3]
+; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm3
+; AVX512F-FAST-NEXT: vpermi2d %xmm16, %xmm7, %xmm3
+; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3]
 ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm4
-; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm20[2],xmm6[3],xmm20[3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm8
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2
+; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24
 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6]
-; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm4
-; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm30[2],xmm3[3],xmm30[3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
-; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm23[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,6],ymm5[7]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm21[3,1,2,3,7,5,6,7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 {%k1}
-; AVX512F-FAST-NEXT: vpermi2d %xmm31, %xmm17, %xmm0
-; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm18[3,1,2,3,7,5,6,7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm29, %xmm3
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm0, %xmm8
+; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm29, %xmm9
-; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm31[2],xmm17[3],xmm31[3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
+; AVX512F-FAST-NEXT: vpermi2d %xmm22, %xmm17, %xmm0
+; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
+; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm27, %xmm5
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3]
+; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm27, %xmm7
+; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rsi)
 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rdx)
 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rcx)
-; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovaps %zmm2, (%r8)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, (%r9)
-; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, (%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, (%r8)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, (%r9)
 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, (%rax)
 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
 ; AVX512F-FAST-NEXT: addq $552, %rsp # imm = 0x228
 ; AVX512F-FAST-NEXT: vzeroupper
 ; AVX512F-FAST-NEXT: retq
@@ -3978,41 +3986,42 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
 ; SSE-LABEL: load_i16_stride8_vf64:
 ; SSE: # %bb.0:
-; SSE-NEXT: subq $1752, %rsp # imm = 0x6D8
+; SSE-NEXT: subq $1800, %rsp # imm = 0x708
 ; SSE-NEXT: movdqa 752(%rdi), %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 736(%rdi), %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 192(%rdi), %xmm12
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 192(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 240(%rdi), %xmm5
 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 224(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 224(%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 144(%rdi), %xmm7
 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 128(%rdi), %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm8
+; SSE-NEXT: movdqa 128(%rdi), %xmm8
 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 160(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; SSE-NEXT: movdqa %xmm0, %xmm8
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSE-NEXT: movdqa %xmm6, %xmm7
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0]
+; SSE-NEXT: movdqa %xmm6, %xmm12
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm8, %xmm4
+; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
+; SSE-NEXT: movdqa %xmm9, %xmm7
 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
@@ -4046,16 +4055,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 608(%rdi), %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 592(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 576(%rdi), %xmm13
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0]
+; SSE-NEXT: movdqa 576(%rdi), %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE-NEXT: movdqa 560(%rdi), %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4081,13 +4091,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 464(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 448(%rdi), %xmm5
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE-NEXT: movdqa 448(%rdi), %xmm10
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0]
-; SSE-NEXT: movdqa %xmm5, %xmm8
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE-NEXT: movdqa 432(%rdi), %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4113,17 +4122,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 976(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 960(%rdi), %xmm15
-; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 960(%rdi), %xmm13
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE-NEXT: movdqa 944(%rdi), %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 928(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm0, %xmm3
 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4144,29 +4153,30 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 320(%rdi), %xmm13
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: movdqa 320(%rdi), %xmm10
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE-NEXT: movdqa 304(%rdi), %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 288(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: movdqa 288(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm0, %xmm3
 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 272(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 256(%rdi), %xmm15
-; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
-; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm15, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 272(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 256(%rdi), %xmm14
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 880(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 864(%rdi), %xmm1
@@ -4175,87 +4185,92 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 848(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 832(%rdi), %xmm14
-; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: movdqa 816(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 800(%rdi), %xmm11
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 784(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 768(%rdi), %xmm10
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm2
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: movdqa 832(%rdi), %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: movdqa 816(%rdi), %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 112(%rdi), %xmm0
+; SSE-NEXT: movdqa 800(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 96(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 80(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 784(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 64(%rdi), %xmm15
+; SSE-NEXT: movdqa 768(%rdi), %xmm15
 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
-; SSE-NEXT: movdqa 32(%rdi), %xmm9
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm15, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 112(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 96(%rdi), %xmm13
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 80(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 64(%rdi), %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE-NEXT: movdqa 32(%rdi), %xmm8
+; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 48(%rdi), %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rdi), %xmm12
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,3]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; SSE-NEXT: movdqa %xmm6, %xmm1
 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
+; SSE-NEXT: movdqa %xmm7, %xmm14
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: movdqa %xmm12, %xmm4
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movaps %xmm4, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -4263,19 +4278,19 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -4283,8 +4298,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4293,137 +4308,140 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: movaps %xmm5, %xmm0
+; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[1,1,1,1]
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
+; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = mem[2,2,2,2]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm1 = mem[2,2,2,2]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,2,2]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: movapd %xmm7, %xmm0
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm1 = mem[2,2,2,2]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movapd %xmm2, %xmm0
 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE-NEXT: movapd %xmm4, %xmm0
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: movapd %xmm7, %xmm0
 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; SSE-NEXT: movapd %xmm8, %xmm0
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[2,2,2,2]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2]
+; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[2,2,2,2]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
+; SSE-NEXT: movdqa %xmm15, %xmm2
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = mem[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
@@ -4435,43 +4453,42 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
-; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm12
+; SSE-NEXT: movdqa %xmm11, %xmm0
 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
 ; SSE-NEXT: movdqa %xmm6, %xmm13
 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -4484,32 +4501,33 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
 ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,0,0]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
@@ -4526,63 +4544,63 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
 ; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
 ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,0,0]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7]
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,0,0]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
-; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0]
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -4594,10 +4612,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -4615,63 +4631,61 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-;
SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; 
SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4680,29 +4694,28 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 @@ -4711,35 +4724,37 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] -; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: 
# xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4749,30 +4764,30 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: unpckhps (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: movaps %xmm2, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; SSE-NEXT: 
shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4783,14 +4798,15 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd $255, (%rsp), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -4805,14 +4821,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4829,14 +4847,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = 
xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] @@ -4932,8 +4949,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm12, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -4960,7 +4977,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm6, 32(%rax) ; SSE-NEXT: movaps %xmm8, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $1752, %rsp # imm = 0x6D8 +; SSE-NEXT: addq $1800, %rsp # imm = 0x708 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf64: @@ -4970,15 +4987,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 @@ -4990,38 +5005,38 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 
496(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -5031,9 +5046,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), 
%xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -5046,21 +5061,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm3 @@ -5073,14 +5088,15 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 928(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 896(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ 
-5159,16 +5175,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -5193,56 +5209,59 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 ; AVX1-ONLY-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm11[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm9[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] @@ -5267,91 +5286,89 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm4[0],mem[0],xmm4[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] 
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1,2],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm11[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = 
mem[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5363,8 +5380,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload @@ -5384,78 +5400,78 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,3,2,3] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm12[3,3,3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # 
xmm15 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm15[0],xmm9[1],xmm15[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -5464,13 +5480,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 -; AVX1-ONLY-NEXT: 
vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -5481,9 +5495,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5495,7 +5508,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload @@ -5508,7 +5522,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -5542,55 +5556,55 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: 
vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 
+; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5599,49 +5613,51 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} 
xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm10[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0],xmm8[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm13[0],mem[0],xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] @@ -5659,28 +5675,28 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm12[0],mem[0],xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -5689,72 +5705,73 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm3[2],xmm15[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; 
AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -5765,8 +5782,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -5776,58 +5793,58 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm13[2],mem[2],xmm13[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -5897,7 +5914,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2472, %rsp # imm = 0x9A8 +; AVX2-ONLY-NEXT: subq $2408, %rsp # imm = 0x968 ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 @@ -5926,7 +5943,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] @@ -5985,9 +6002,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5995,12 +6012,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6008,13 +6025,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6040,14 +6056,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm0 @@ -6060,7 +6076,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 @@ -6068,84 +6084,84 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 624(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 592(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm7, %xmm7 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 560(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 624(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 592(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastd %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 560(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,1,0,2] -; 
AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm10[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm6[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm11[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,1,0,2] +; 
AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] ; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -6160,31 +6176,31 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -6192,15 +6208,15 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; 
AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -6215,83 +6231,84 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte 
Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] @@ -6300,17 +6317,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = 
ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6326,9 +6343,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6337,7 +6354,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,3,3,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -6347,8 +6365,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4],ymm3[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6357,30 +6376,30 @@ define void 
@load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -6389,21 +6408,23 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm6 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] @@ -6461,12 +6482,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] @@ -6474,11 +6493,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6497,13 +6515,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] @@ -6511,12 +6528,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw 
{{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] @@ -6524,299 +6539,293 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %xmm6, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} 
xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu 
%ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; 
AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] 
+; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-ONLY-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 
16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} 
ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX2-ONLY-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload @@ -6828,7 +6837,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -6879,7 +6888,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -6887,50 +6896,50 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-ONLY-NEXT: addq $2472, %rsp # imm = 0x9A8 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-ONLY-NEXT: addq $2408, %rsp # imm = 0x968 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $2424, %rsp # imm = 0x978 -; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: subq $2408, %rsp # imm = 0x968 +; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; 
AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6942,7 +6951,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] @@ -6953,12 +6962,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] @@ -6966,16 +6975,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: movb $-64, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6983,22 +6992,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), 
%xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] @@ -7009,357 +7018,351 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm2[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq 
{{.*#+}} ymm24 = ymm3[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm3[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 880(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 880(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 848(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 848(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 832(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa 816(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 800(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa 784(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 768(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 992(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa 992(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa 928(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; 
AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 624(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 896(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm2[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 624(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 592(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 592(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm15[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm15 -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm6, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 560(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = 
xmm2[0,1,2],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 560(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 528(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] -; AVX512F-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa 672(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm0[0,1,0,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-SLOW-NEXT: vmovdqa 672(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2] +; AVX512F-SLOW-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; 
AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm30[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm4[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm22[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm28[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: 
vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm20[2],xmm10[2],xmm20[3],xmm10[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm23 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, 
%zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-SLOW-NEXT: vpshufd $212, (%rsp), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4],ymm6[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm27[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm25[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm30[2],xmm2[2],xmm30[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm13 = xmm13[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 
+; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm22[2],xmm7[2],xmm22[3],xmm7[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm27 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm28[2],xmm21[2],xmm28[3],xmm21[3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm19[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm22 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm24 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = 
ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512F-SLOW-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm1[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = 
ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm25, %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm22, %xmm16, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw $212, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-SLOW-NEXT: vpermt2d %xmm27, %xmm16, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-SLOW-NEXT: vpermt2d %xmm24, %xmm16, %xmm0 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm21, %xmm16, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpermt2d %xmm18, %xmm16, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm15[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7373,21 +7376,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] -; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = 
xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm10, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3] @@ -7430,7 +7433,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpermt2d %xmm1, %xmm10, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -7450,22 +7453,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm30 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7478,292 +7480,304 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm10, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24 -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm26 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm27 = mem[0,1,1,3] -; 
AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm26[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm28 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm22 = mem[0,1,1,3] ; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm29 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm3 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vpermi2d %xmm1, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm31 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm31[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,0,4,5,6,4] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpermi2d %xmm2, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm21 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm13[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3] +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3] +; AVX512F-SLOW-NEXT: 
vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm7 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm30[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm7[1],xmm12[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm12 = xmm12[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] -; AVX512F-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm20[0],xmm25[0],xmm20[1],xmm25[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm21[2],xmm9[2],xmm21[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm20 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; 
AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm23[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm30[2],xmm7[2],xmm30[3],xmm7[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm23 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm28[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm25[2,2,2,2] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm14[2],xmm26[2],xmm14[3],xmm26[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm31[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm16[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[3,1,2,3,7,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = 
xmm16[2],xmm20[2],xmm16[3],xmm20[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm30 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm29 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload ; 
AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm2 -; AVX512F-SLOW-NEXT: vpermt2d %xmm22, %xmm16, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm6 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512F-SLOW-NEXT: vpermt2d %xmm23, %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm3[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpermt2d %xmm23, %xmm16, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm2 = xmm13[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %xmm26, %xmm16, %xmm14 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm25[2],xmm10[3],xmm25[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-SLOW-NEXT: vpermt2d %xmm30, %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = xmm3[0,1],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %xmm29, %xmm16, %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -7796,13 +7810,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-SLOW-NEXT: addq $2424, %rsp # imm = 0x978 +; AVX512F-SLOW-NEXT: addq $2408, %rsp # imm = 0x968 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $2392, %rsp # imm = 0x958 +; AVX512F-FAST-NEXT: subq $2312, %rsp # imm = 0x908 ; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm1 @@ -7812,12 +7826,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm5 ; AVX512F-FAST-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7830,8 +7843,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 @@ 
-7873,15 +7887,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 ; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 @@ -7891,12 +7906,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX512F-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7906,11 +7919,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill @@ -7920,10 +7933,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -7933,352 +7946,351 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 864(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa 848(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm3 -; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm2 +; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512F-FAST-NEXT: vmovdqa 816(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa 800(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa 784(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 768(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; 
AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-FAST-NEXT: vmovdqa 768(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vmovdqa 992(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 960(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa 928(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 896(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7] ; AVX512F-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1} ; AVX512F-FAST-NEXT: vmovdqa 624(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa 592(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm29 +; AVX512F-FAST-NEXT: vpermt2d %xmm11, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm16 ; AVX512F-FAST-NEXT: vmovdqa 560(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa 528(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm29 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm21 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = 
ymm3[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa 672(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [1,5,1,5] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm11, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm13 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm16[0],xmm20[0],xmm16[1],xmm20[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512F-FAST-NEXT: vpermt2d 
%xmm18, %xmm13, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm31[0],xmm15[0],xmm31[1],xmm15[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm17 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512F-FAST-NEXT: vpermt2d %xmm24, %xmm13, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm25[0],xmm23[0],xmm25[1],xmm23[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm18 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm13 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm13, %xmm9 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm11 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm11 +; AVX512F-FAST-NEXT: vpermt2d %xmm25, %xmm13, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm8 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm19 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm4 -; AVX512F-FAST-NEXT: vpermt2d %xmm26, %xmm13, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm13 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm28[0],xmm19[0],xmm28[1],xmm19[1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = 
ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512F-FAST-NEXT: vpermt2d %xmm21, %xmm13, %xmm3 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, %xmm4 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm22[2],xmm30[2],xmm22[3],xmm30[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm27 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm24, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm28 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm17[2],xmm15[2],xmm17[3],xmm15[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm15, %xmm4, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 (%rsp), %xmm17 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm19[2],xmm17[2],xmm19[3],xmm17[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm23 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: 
vpshuflw {{.*#+}} ymm3 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm21 = xmm18[2],xmm23[2],xmm18[3],xmm23[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm23, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm11[2],xmm23[3],xmm11[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm8, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm22 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte 
Folded Reload +; AVX512F-FAST-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1} +; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm13, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm14 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] -; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm0 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm0 = xmm1[0,1],mem[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm2 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4],ymm2[5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm15 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm18, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 +; 
AVX512F-FAST-NEXT: vpermt2d %xmm23, %xmm16, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm18, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm16, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %xmm7, %xmm18, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm21, %xmm16, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8297,11 +8309,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm30 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -8333,8 +8344,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -8345,13 +8356,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-FAST-NEXT: vpermt2d %xmm8, %xmm5, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm27 +; 
AVX512F-FAST-NEXT: vpermt2d %xmm6, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm18 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3] @@ -8364,7 +8374,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3] @@ -8394,28 +8404,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm31 ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12 ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} 
ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] @@ -8423,13 +8430,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -8437,50 +8444,52 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm1 ; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm3, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm30 ; AVX512F-FAST-NEXT: vmovdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3] ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3] +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm15, %xmm0 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm16[0],xmm9[1],xmm16[1] +; AVX512F-FAST-NEXT: 
vpermt2d %xmm19, %xmm15, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -8495,14 +8504,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-FAST-NEXT: vpermt2d %xmm29, %xmm15, %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm23[0],xmm8[0],xmm23[1],xmm8[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm24, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; AVX512F-FAST-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm8 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm13 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] @@ -8511,1379 +8520,1351 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm22[0],xmm26[0],xmm22[1],xmm26[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm25, %xmm15, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm21 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 
= ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm6 -; AVX512F-FAST-NEXT: vpermi2d %xmm28, %xmm25, %xmm6 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm24[0],xmm2[0],xmm24[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-FAST-NEXT: vpermi2d %xmm28, %xmm30, %xmm6 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm7 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm1 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm16[2],xmm9[3],xmm16[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] -; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm27 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm16[2],xmm17[2],xmm16[3],xmm17[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,6,2,6] +; AVX512F-FAST-NEXT: vpermt2d %xmm29, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded 
Reload +; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm18 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm23[2],xmm1[2],xmm23[3],xmm1[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm1, %xmm0, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm29[2],xmm18[2],xmm29[3],xmm18[3] -; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm8[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm9 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm21 = xmm22[2],xmm0[2],xmm22[3],xmm0[3] -; AVX512F-FAST-NEXT: vpermt2d %xmm0, %xmm1, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm19[2],xmm17[2],xmm19[3],xmm17[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-FAST-NEXT: 
vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3] +; AVX512F-FAST-NEXT: vpermt2d %xmm13, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm21 ; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm15 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1} +; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm27 +; AVX512F-FAST-NEXT: vpermi2d %xmm7, %xmm17, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; 
AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm22 -; AVX512F-FAST-NEXT: vpermi2d %xmm20, %xmm24, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm23[2],xmm10[2],xmm23[3],xmm10[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX512F-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm24 = [3,7,3,7] -; AVX512F-FAST-NEXT: vpermt2d %xmm27, %xmm24, %xmm1 -; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm1 = xmm0[0,1],mem[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # 
ymm7 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm12 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm24, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm8 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm6 -; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm24, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm17, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512F-FAST-NEXT: vpermt2d %xmm22, %xmm17, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm17, %xmm4 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm24, %xmm7 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm22[2],xmm20[2],xmm22[3],xmm20[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rsi) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rdx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rdx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rcx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rcx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%r8) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%r8) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%r9) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%r9) +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%r8) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, 64(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rax) ; AVX512F-FAST-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-FAST-NEXT: addq $2392, %rsp # imm = 0x958 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-FAST-NEXT: addq $2312, %rsp # imm = 0x908 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i16_stride8_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512BW-ONLY-SLOW-NEXT: 
kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = 
[5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w 
%zmm21, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = 
[2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i16_stride8_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, 
%zmm13, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w 
%zmm22, %zmm5, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = 
[3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = 
mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i16_stride8_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
832(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: movb $-64, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: 
vpermt2w %zmm17, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, 
%zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 -; 
AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-SLOW-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: 
vpermi2w %zmm31, %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; 
AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: load_i16_stride8_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: movb $-64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm2, 
%zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm6, %zmm8, %zmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm4, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm9, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm15, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm13, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm10, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm15, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm20, %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm17, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm3, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm15, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm13, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm21, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm10, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm15, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm1, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, 
%zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] ; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] ; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; 
AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm28, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm28, %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 
$0, %ymm8, %zmm31, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; 
AVX512DQBW-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <512 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll index b420710a4bbfd..b85824d860c84 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -276,33 +276,33 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps 208(%rdi), %xmm9 ; SSE-NEXT: movaps 192(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps 80(%rdi), %xmm13 ; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 240(%rdi), %xmm10 +; SSE-NEXT: movaps 240(%rdi), %xmm11 ; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm12 +; SSE-NEXT: movaps 112(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rdi), %xmm12 ; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm14 +; SSE-NEXT: movaps 176(%rdi), %xmm15 ; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm14[1,3] +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm13[1,3] +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm15[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm15[1,3] +; SSE-NEXT: movaps %xmm6, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm12[1,3] -; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3] +; SSE-NEXT: movaps %xmm5, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm11[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm11[1,3] -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm14[1,3] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm13[1,3] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm10[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm10[1,3] -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm11[1,3] +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm9[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm9[1,3] ; SSE-NEXT: movaps %xmm1, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm8[0,2] @@ -312,13 +312,13 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm8[1,3] -; SSE-NEXT: movaps %xmm10, 96(%rsi) -; SSE-NEXT: movaps %xmm12, 32(%rsi) -; SSE-NEXT: movaps %xmm13, 112(%rsi) -; SSE-NEXT: movaps %xmm15, 48(%rsi) -; SSE-NEXT: movaps %xmm14, 64(%rsi) +; SSE-NEXT: movaps %xmm11, 96(%rsi) +; SSE-NEXT: movaps %xmm14, 32(%rsi) +; SSE-NEXT: movaps %xmm12, 
112(%rsi) +; SSE-NEXT: movaps %xmm10, 48(%rsi) +; SSE-NEXT: movaps %xmm15, 64(%rsi) ; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm11, 80(%rsi) +; SSE-NEXT: movaps %xmm13, 80(%rsi) ; SSE-NEXT: movaps %xmm9, 16(%rsi) ; SSE-NEXT: movaps %xmm3, 96(%rdx) ; SSE-NEXT: movaps %xmm5, 112(%rdx) @@ -432,61 +432,61 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i32_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rdi), %xmm1 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 272(%rdi), %xmm9 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 304(%rdi), %xmm12 -; SSE-NEXT: movaps 288(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm13[1,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm10[1,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm15[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm15[1,3] +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rdi), %xmm14 +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 272(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm9 +; SSE-NEXT: movaps 304(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm13 +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm3[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm1[1,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm14[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm14[1,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm11[1,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[0,2],xmm7[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm7[1,3] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm12[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,3],xmm4[1,3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm2[1,3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm14[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm14[1,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm11[1,3] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm9[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm12[1,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps 352(%rdi), %xmm15 @@ -496,46 +496,46 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,3],xmm0[1,3] ; SSE-NEXT: movaps 336(%rdi), %xmm0 ; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3] ; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movaps 416(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,3],xmm0[1,3] ; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: movaps 384(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm0[1,3] -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps 480(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm9 = 
xmm9[1,3],xmm0[1,3] +; SSE-NEXT: movaps 496(%rdi), %xmm0 +; SSE-NEXT: movaps 480(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm0[1,3] +; SSE-NEXT: movaps 464(%rdi), %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm1[1,3] +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3] -; SSE-NEXT: movaps 464(%rdi), %xmm3 -; SSE-NEXT: movaps 448(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,3],xmm2[1,3] -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm2[1,3] -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps %xmm12, 160(%rsi) +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm1[1,3] +; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm0[1,3] +; SSE-NEXT: movaps %xmm2, 224(%rsi) +; SSE-NEXT: movaps %xmm11, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm5, 240(%rsi) +; SSE-NEXT: movaps %xmm6, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -547,17 +547,17 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 128(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps %xmm14, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm1, 224(%rdx) -; SSE-NEXT: movaps %xmm4, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm9, 208(%rdx) +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm3, 224(%rdx) +; SSE-NEXT: movaps %xmm7, 240(%rdx) +; SSE-NEXT: movaps %xmm9, 192(%rdx) +; SSE-NEXT: movaps %xmm12, 208(%rdx) ; SSE-NEXT: movaps %xmm13, 160(%rdx) ; SSE-NEXT: movaps %xmm15, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -576,141 +576,141 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) -; SSE-NEXT: movaps %xmm11, 
16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2],ymm8[0,2],ymm9[4,6],ymm8[4,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2],ymm8[0,2],ymm10[4,6],ymm8[4,6] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,2],ymm12[0,2],ymm7[4,6],ymm12[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm7[0,2],ymm11[4,6],ymm7[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,2],ymm14[0,2],ymm5[4,6],ymm14[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,3],ymm10[1,3],ymm11[5,7],ymm10[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3],ymm12[1,3],ymm7[5,7],ymm12[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3],ymm14[1,3],ymm5[5,7],ymm14[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,3],ymm8[1,3],ymm9[5,7],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,2],ymm14[0,2],ymm2[4,6],ymm14[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm14[1,3],ymm2[5,7],ymm14[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,2],ymm12[0,2],ymm4[4,6],ymm12[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm12[1,3],ymm4[5,7],ymm12[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,2],ymm11[0,2],ymm6[4,6],ymm11[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm11[1,3],ymm6[5,7],ymm11[5,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm5 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,3],ymm7[1,3],ymm11[5,7],ymm7[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,3],ymm12[1,3],ymm13[5,7],ymm12[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,3],ymm8[1,3],ymm10[5,7],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[0,2],ymm15[0,2],ymm4[4,6],ymm15[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm15[1,3],ymm4[5,7],ymm15[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,2],ymm13[0,2],ymm6[4,6],ymm13[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm13[1,3],ymm6[5,7],ymm13[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm11[0,2],ymm9[4,6],ymm11[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3],ymm11[1,3],ymm9[5,7],ymm11[5,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm11[0,2],ymm1[4,6],ymm11[4,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm11[1,3],ymm1[5,7],ymm11[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride2_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), 
%ymm12 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,2],ymm2[0,2],ymm13[4,6],ymm2[4,6] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,3],ymm12[1,3],ymm13[5,7],ymm12[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,3],ymm10[1,3],ymm11[5,7],ymm10[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[0,2],ymm4[0,2],ymm7[4,6],ymm4[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,3],ymm4[1,3],ymm7[5,7],ymm4[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,2],ymm5[0,2],ymm8[4,6],ymm5[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3],ymm5[1,3],ymm8[5,7],ymm5[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,2],ymm6[0,2],ymm9[4,6],ymm6[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,3],ymm6[1,3],ymm9[5,7],ymm6[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm2[1,3],ymm1[5,7],ymm2[5,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,3],ymm2[1,3],ymm3[5,7],ymm2[5,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,3],ymm2[1,3],ymm13[5,7],ymm2[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm5[0,2],ymm15[4,6],ymm5[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,3],ymm5[1,3],ymm15[5,7],ymm5[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,2],ymm6[0,2],ymm14[4,6],ymm6[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,3],ymm6[1,3],ymm14[5,7],ymm6[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm10[0,2],ymm12[4,6],ymm10[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,3],ymm10[1,3],ymm12[5,7],ymm10[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,2],ymm8[0,2],ymm11[4,6],ymm8[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,3],ymm8[1,3],ymm11[5,7],ymm8[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2],ymm7[0,2],ymm9[4,6],ymm7[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,3],ymm7[1,3],ymm9[5,7],ymm7[5,7] +; AVX2-ONLY-NEXT: vshufps 
{{.*#+}} ymm9 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm4[0,2],ymm1[4,6],ymm4[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm4[1,3],ymm1[5,7],ymm4[5,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 2829c15ed8256..f6a81c61b87a7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -213,40 +213,40 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 80(%rdi), %xmm0 -; SSE-NEXT: movaps 64(%rdi), %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rdi), %xmm4 ; SSE-NEXT: 
movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,2] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,3] -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm7, 16(%rsi) +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movaps %xmm6, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf8: @@ -407,72 +407,73 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 160(%rdi), %xmm9 ; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm15 ; SSE-NEXT: movaps 80(%rdi), %xmm14 -; SSE-NEXT: movaps 64(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps 64(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0] ; SSE-NEXT: movaps %xmm15, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] ; SSE-NEXT: 
movaps %xmm7, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movaps %xmm11, %xmm2 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm15, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[0,0] +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: movaps %xmm4, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm9[0,0] ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm8[0,0] ; SSE-NEXT: movaps %xmm8, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm12[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm10[0,3] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] @@ -483,13 +484,13 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps %xmm6, 48(%rdx) ; SSE-NEXT: movaps %xmm7, (%rdx) ; SSE-NEXT: movaps %xmm11, 16(%rdx) ; SSE-NEXT: movaps %xmm4, 32(%rcx) ; SSE-NEXT: movaps %xmm8, 48(%rcx) -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: retq ; @@ -731,238 +732,240 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $360, %rsp # imm = 0x168 -; SSE-NEXT: movaps 192(%rdi), %xmm3 -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps 208(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $392, %rsp # imm = 0x188 +; SSE-NEXT: movaps 192(%rdi), %xmm4 +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps 208(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm6 ; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm2 ; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] +; SSE-NEXT: 
movaps %xmm6, %xmm10 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm11 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0] -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm3, %xmm13 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm15 +; SSE-NEXT: movaps 
%xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 288(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2] +; SSE-NEXT: movaps 96(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 304(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm14 ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm0[0,0] ; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm13, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: 
shufps {{.*#+}} xmm11 = xmm11[1,0],xmm15[0,0] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, (%rsp), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm12[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = 
mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 16(%rsi) -; SSE-NEXT: movaps %xmm2, 96(%rdx) -; SSE-NEXT: movaps %xmm10, 32(%rdx) -; SSE-NEXT: movaps %xmm14, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps %xmm3, 96(%rdx) +; SSE-NEXT: movaps %xmm11, 32(%rdx) +; SSE-NEXT: movaps %xmm7, 112(%rdx) ; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm11, 64(%rdx) +; SSE-NEXT: movaps %xmm10, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm14, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps %xmm0, 96(%rcx) ; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps %xmm13, 64(%rcx) -; SSE-NEXT: movaps %xmm7, 80(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps %xmm5, 48(%rcx) -; SSE-NEXT: movaps %xmm6, (%rcx) -; SSE-NEXT: 
movaps %xmm4, 16(%rcx) -; SSE-NEXT: addq $360, %rsp # imm = 0x168 +; SSE-NEXT: movaps %xmm15, 64(%rcx) +; SSE-NEXT: movaps %xmm13, 80(%rcx) +; SSE-NEXT: movaps %xmm4, 32(%rcx) +; SSE-NEXT: movaps %xmm6, 48(%rcx) +; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps %xmm8, 16(%rcx) +; SSE-NEXT: addq $392, %rsp # imm = 0x188 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -970,233 +973,236 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm10[2,0],ymm8[5,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6],ymm7[7] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm4[1,3],ymm1[6,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm10[3,0],ymm8[6,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,0],ymm0[2,0],ymm10[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm11, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm11[3,0],ymm10[6,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,0],ymm8[2,0],ymm11[4,4],ymm8[6,4] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,2],ymm13[0,3],ymm8[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,0],ymm14[3,0],ymm6[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] 
+; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[0,3],ymm6[5,6],ymm13[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm14[3,0],ymm4[6,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0],ymm6[2,0],ymm14[4,4],ymm6[6,4] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm0[3,0],ymm7[6,4],ymm0[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm11[0,3],ymm4[5,6],ymm11[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,0],ymm4[2,0],ymm12[4,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm12[3,0],ymm2[6,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0],ymm0[2,0],ymm12[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm9[0,3],ymm7[6,4],ymm9[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm13[1,0],ymm6[2,0],ymm13[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm9[0,1],mem[0,3],ymm9[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7] -; AVX1-ONLY-NEXT: vshufps $196, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm3[2,0],ymm10[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm3[2,0],ymm11[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,3],ymm1[4,5],ymm4[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7] -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm7[0,3],ymm0[4,5],ymm7[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,3],ymm1[6,4],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) ; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $104, %rsp -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $136, %rsp +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 -; 
AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm11, %ymm11 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] -; AVX2-SLOW-NEXT: 
vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: addq $104, %rsp +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-SLOW-NEXT: addq $136, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1205,81 +1211,81 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: subq $104, %rsp ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovaps (%rdi), 
%ymm15 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm12 ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; 
AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [0,1,0,3,0,1,4,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm9, %ymm11 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = [0,1,0,3,0,1,4,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: 
vpermps %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm13, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload @@ -1289,119 +1295,123 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm8, 96(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm8, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: addq $104, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $136, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; 
AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, 
%ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: 
vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: addq $136, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1454,47 +1464,47 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1064, %rsp # imm = 0x428 +; SSE-NEXT: subq $1112, %rsp # imm = 0x458 ; SSE-NEXT: movaps 624(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 656(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 432(%rdi), %xmm4 +; SSE-NEXT: movaps 656(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm6 +; SSE-NEXT: movaps 640(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 432(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 464(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 448(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rdi), %xmm11 -; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm9 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] ; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm6, %xmm1 ; 
SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1503,227 +1513,231 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 192(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 416(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 400(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 384(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 592(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 576(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm6 -; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 576(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm10 +; SSE-NEXT: movaps 160(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 560(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 560(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 528(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 752(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 528(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 752(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 736(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 720(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[1,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm1 -; SSE-NEXT: 
movaps %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] +; SSE-NEXT: movaps 720(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[1,0] +; SSE-NEXT: movaps 96(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps 304(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm13[1,0] +; SSE-NEXT: movaps 304(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[1,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 512(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[1,0] -; SSE-NEXT: movaps 480(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 688(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm9[1,0] -; SSE-NEXT: movaps 672(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] +; SSE-NEXT: movaps 480(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 704(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 688(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[1,0] +; SSE-NEXT: movaps 672(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0] -; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[1,0],xmm3[0,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm12 +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: movaps %xmm14, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm14[0,0] -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm15[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm11[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 
= xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[0,0] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: 
shufps {{.*#+}} xmm14 = xmm14[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm1[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE-NEXT: movaps %xmm2, %xmm11 ; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -1732,13 +1746,12 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = 
xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload @@ -1746,11 +1759,18 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[0,3] +; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -1758,11 +1778,6 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -1770,13 +1785,12 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] @@ -1788,16 +1802,15 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] @@ -1815,8 +1828,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -1856,8 +1868,9 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm4, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm12, 224(%rdx) -; SSE-NEXT: movaps %xmm14, 240(%rdx) +; SSE-NEXT: movaps %xmm14, 224(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -1895,22 +1908,21 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm7, 144(%rcx) ; SSE-NEXT: movaps %xmm8, 128(%rcx) ; SSE-NEXT: movaps %xmm9, 112(%rcx) -; SSE-NEXT: movaps %xmm15, 96(%rcx) -; SSE-NEXT: movaps %xmm10, 80(%rcx) -; SSE-NEXT: movaps %xmm11, 64(%rcx) +; SSE-NEXT: movaps %xmm10, 96(%rcx) +; SSE-NEXT: movaps %xmm11, 80(%rcx) +; SSE-NEXT: movaps %xmm12, 64(%rcx) ; SSE-NEXT: movaps %xmm13, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm15, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: addq $1064, %rsp # imm = 0x428 +; SSE-NEXT: addq $1112, %rsp # imm = 0x458 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1416, %rsp # imm = 0x588 +; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 ; 
AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1942,8 +1954,8 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1952,8 +1964,8 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm5[2,0],ymm2[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1967,87 +1979,87 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} 
ymm13 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[2,0],ymm1[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm9[1,3],ymm0[6,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm11 +; AVX1-ONLY-NEXT: 
vmovaps 608(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm11[2,0],ymm2[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm0[0,3],ymm1[5,6],ymm0[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 
304(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2057,71 +2069,68 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[3,0],ymm0[6,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,0],ymm0[2,0],ymm5[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[3,0],ymm0[6,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[2,0],ymm3[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] 
+; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[3,0],ymm0[6,4],ymm13[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0],ymm0[2,0],ymm13[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm3[0,3],ymm2[5,6],ymm3[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm4[0,3],ymm2[5,6],ymm4[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm12[3,0],ymm4[6,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0],ymm1[2,0],ymm12[4,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,2],ymm4[0,3],ymm5[5,6],ymm4[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm5[0,3],ymm3[5,6],ymm5[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm7[3,0],ymm6[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0],ymm2[2,0],ymm7[4,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm9[3,0],ymm8[6,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm2[2,0],ymm9[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm6[0,3],ymm15[5,6],ymm6[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm8[0,3],ymm15[5,6],ymm8[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups 
%ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,0],ymm7[3,0],ymm12[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[0,0],ymm5[2,0],ymm7[4,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm11[2],ymm5[3,4],ymm11[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm7[0,3],ymm14[5,6],ymm7[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm3[2,0],ymm11[4,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm9[0,3],ymm14[5,6],ymm9[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2134,831 +2143,831 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,0],ymm14[2,0],ymm3[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[0,3],ymm0[6,4],ymm8[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,0],ymm0[2,0],ymm13[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[0,3],ymm0[6,4],ymm8[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1],mem[0,3],ymm8[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[0,3],ymm0[6,4],ymm7[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[0,3],ymm0[6,4],ymm9[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[0,3],ymm0[6,4],ymm14[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[0,3],ymm0[6,4],ymm6[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm1[2,0],ymm9[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[0,3],ymm1[6,4],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,3],ymm1[6,4],ymm6[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm10[0,3],ymm2[6,4],ymm10[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] 
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[0,3],ymm2[6,4],ymm12[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[0,3],ymm4[6,4],ymm6[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1],mem[0,3],ymm6[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm5[2],ymm11[3,4],ymm5[5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm11[0,3],ymm5[6,4],ymm11[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm12[0,3],ymm6[4,5],ymm12[4,7] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[0,3],ymm5[6,4],ymm6[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[0,3],ymm3[6,4],ymm10[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1],mem[0,3],ymm11[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1416, %rsp # imm = 0x588 +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm10 +; AVX2-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7] +; AVX2-SLOW-NEXT: 
vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm9 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm13, %ymm7 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; 
AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm13, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm13, %ymm15 -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm13[0,1],mem[2],ymm13[3,4],mem[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpermilps $196, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm12, %ymm6 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm14, %ymm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-SLOW-NEXT: addq $1032, %rsp # imm 
= 0x408 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride3_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX2-FAST-NEXT: subq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm11 -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; 
AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm15 -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $219, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, 
%ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm9, %ymm13 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = [0,1,0,3,0,1,4,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [0,1,0,3,0,1,4,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $36, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm15, %ymm4 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm15, %ymm7 -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rdx) +; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) ; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm14, 32(%rcx) -; AVX2-FAST-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-FAST-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = 
ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm13, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 
608(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm13, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = 
ymm13[0,1],mem[2],ymm13[3,4],mem[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $36, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 
# 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 980312630759a..eba201399b0b9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -420,109 +420,106 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i32_stride4_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm1 -; SSE-NEXT: movaps 144(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm14 -; SSE-NEXT: movaps 176(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm2 +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movaps 208(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 240(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm9 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm11 +; SSE-NEXT: movaps 112(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; 
SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm14, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movaps 192(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps 192(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm15 +; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 16(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: movaps %xmm13, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] -; SSE-NEXT: movaps %xmm14, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm12, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm7[1] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm13, (%rdx) -; SSE-NEXT: movaps %xmm6, 32(%rdx) -; SSE-NEXT: movaps %xmm5, 16(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rcx) -; SSE-NEXT: movaps %xmm8, 32(%rcx) +; SSE-NEXT: movaps %xmm15, 48(%rdx) +; SSE-NEXT: movaps %xmm14, (%rdx) +; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps %xmm7, 48(%rcx) +; SSE-NEXT: movaps %xmm6, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps %xmm12, 48(%r8) -; SSE-NEXT: movaps %xmm14, 32(%r8) -; SSE-NEXT: movaps %xmm10, 16(%r8) -; SSE-NEXT: movaps %xmm11, (%r8) -; SSE-NEXT: addq $40, %rsp +; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movaps %xmm10, 48(%r8) +; SSE-NEXT: movaps %xmm3, 32(%r8) +; SSE-NEXT: movaps %xmm11, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf16: @@ -702,20 +699,20 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] ; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -743,7 +740,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm5 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [3,7,3,7] ; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload @@ -753,7 +750,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm9, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -822,47 +819,47 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i32_stride4_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: movaps 272(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 304(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm10 +; SSE-NEXT: movaps 288(%rdi), %xmm2 +; SSE-NEXT: 
movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 336(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps 320(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm4 ; SSE-NEXT: movaps 96(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movaps 256(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -874,8 +871,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 208(%rdi), %xmm2 ; SSE-NEXT: 
movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -894,189 +891,185 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 464(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 144(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 144(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 432(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 416(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 400(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm12 
+; SSE-NEXT: movaps %xmm12, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 32(%rdi), %xmm11 ; SSE-NEXT: movaps 48(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movaps (%rdi), %xmm6 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 
16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = 
xmm0[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: unpckhps (%rsp), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] +; SSE-NEXT: movaps %xmm15, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps %xmm7, 96(%rdx) +; SSE-NEXT: movaps %xmm14, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 64(%rdx) +; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm3, 96(%rcx) +; SSE-NEXT: movaps %xmm9, 32(%rcx) +; SSE-NEXT: movaps %xmm10, 112(%rcx) +; SSE-NEXT: movaps %xmm4, 48(%rcx) +; SSE-NEXT: movaps %xmm13, 64(%rcx) +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps %xmm8, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps %xmm0, 112(%r8) +; SSE-NEXT: movaps %xmm12, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm7, 96(%rdx) -; SSE-NEXT: movaps %xmm11, 32(%rdx) -; SSE-NEXT: movaps %xmm15, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps %xmm6, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps %xmm4, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps %xmm14, (%rcx) -; SSE-NEXT: movaps %xmm1, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: movaps %xmm9, 96(%r8) -; 
SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps %xmm13, 32(%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm15, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r8) -; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf32: @@ -1087,25 +1080,25 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm10 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1131,31 +1124,32 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] +; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1163,91 +1157,92 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,0],ymm10[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[1],ymm6[1],ymm13[4],ymm6[4],ymm13[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm11[1,0],ymm5[5,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[4],ymm10[4],ymm15[5],ymm10[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[1],xmm11[1],zero,zero +; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm9[1],zero,zero ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm9[0],mem[0],xmm9[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm14[1,0],ymm12[5,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[4],ymm13[4],ymm8[5],ymm13[5] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm3[1,0],ymm1[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm15[2,3],ymm1[6,4],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm14[2,3],ymm1[6,4],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm4[0],mem[0],xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = 
ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[4],ymm7[4],ymm12[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[1,0],ymm1[5,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm8[2,3],ymm1[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm12[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[6],ymm5[6],ymm10[7],ymm5[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm9[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = zero,zero,xmm9[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] @@ -1255,41 +1250,39 @@ define void 
@load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm4[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = zero,zero,xmm5[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm2[2],xmm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm7[2],xmm12[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm7[2],xmm13[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -1298,69 +1291,70 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, (%rsp), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,0],ymm8[3,0],ymm5[7,4],ymm8[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[2,3],ymm4[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[3,0],xmm11[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm8[3,0],ymm3[7,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[2,3],ymm3[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = 
xmm10[3,0],xmm9[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm4[2,3],ymm5[6,4],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,0],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) @@ -1370,71 +1364,70 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8 +; 
AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm12 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm13 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 @@ -1443,143 +1436,146 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 -; AVX2-ONLY-NEXT: 
vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm7, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm7, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm1, %ymm5 +; AVX2-ONLY-NEXT: vmovaps %ymm13, %ymm10 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps %ymm12, %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm13 -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm13 +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm11 -; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm10, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3,7,3,7,3,7,3,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm1, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1596,17 +1592,17 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX2-ONLY-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1687,81 +1683,78 @@ define void @load_i32_stride4_vf64(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride4_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1224, %rsp # imm = 0x4C8 -; SSE-NEXT: movaps 144(%rdi), %xmm4 -; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps 144(%rdi), %xmm14 +; SSE-NEXT: movaps 176(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm12 +; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm4, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm8, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movaps 128(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; 
SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm12 ; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 336(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movaps 336(%rdi), %xmm13 +; SSE-NEXT: movaps 320(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 304(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 272(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1782,26 +1775,26 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 432(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 432(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 416(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 400(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 384(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 624(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 592(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 576(%rdi), %xmm1 @@ -1847,11 +1840,12 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 672(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 656(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 656(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 640(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1874,8 +1868,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 800(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 800(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 784(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1889,316 +1883,312 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 992(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 992(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 976(%rdi), %xmm1 +; SSE-NEXT: movaps 976(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 960(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 944(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 928(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 912(%rdi), %xmm0 +; SSE-NEXT: movaps 928(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 912(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 896(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: movaps 32(%rdi), %xmm6 +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movaps 16(%rdi), %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: unpckhps (%rsp), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: 
movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; 
SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm4[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: movaps %xmm3, %xmm4
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm2, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0]
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1]
 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-NEXT: movaps %xmm11, %xmm0
-; SSE-NEXT: movaps %xmm11, %xmm12
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm12, %xmm15
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1]
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm10, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1]
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, %xmm11
+; SSE-NEXT: movaps %xmm9, %xmm6
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1]
+; SSE-NEXT: movaps %xmm9, %xmm11
+; SSE-NEXT: movaps %xmm5, %xmm0
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, %xmm7
+; SSE-NEXT: movaps %xmm0, %xmm4
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm7, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, %xmm4
+; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, %xmm13
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, %xmm8
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1]
-; SSE-NEXT: movaps %xmm2, %xmm9
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: movaps %xmm15, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1]
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 224(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 160(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 96(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 32(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 240(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 176(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 112(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 48(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 192(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 128(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 64(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, (%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 208(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 144(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 80(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: movaps %xmm6, 16(%rsi)
-; SSE-NEXT: movaps %xmm5, 224(%rdx)
-; SSE-NEXT: movaps %xmm14, 240(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 192(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 208(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 160(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 176(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 128(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 144(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 96(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 112(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 64(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 80(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 32(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 48(%rdx)
-; SSE-NEXT: movaps %xmm10, (%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, 16(%rdx)
-; SSE-NEXT: movaps %xmm3, 240(%rcx)
-; SSE-NEXT: movaps %xmm8, 224(%rcx)
-; SSE-NEXT: movaps %xmm13, 208(%rcx)
-; SSE-NEXT: movaps %xmm0, 192(%rcx)
-; SSE-NEXT: movaps %xmm1, 176(%rcx)
-; SSE-NEXT: movaps %xmm4, 160(%rcx)
-; SSE-NEXT: movaps %xmm7, 144(%rcx)
-; SSE-NEXT: movaps %xmm11, 128(%rcx)
-; SSE-NEXT: movaps %xmm12, 112(%rcx)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 96(%rcx)
+; SSE-NEXT: movaps %xmm0, %xmm12
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm7[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
+; SSE-NEXT: movaps %xmm0, %xmm7
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, %xmm0
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
+; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, %xmm13
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
+; SSE-NEXT: movaps %xmm8, %xmm9
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 224(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 160(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 96(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 32(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 240(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 176(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 112(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 48(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 192(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 128(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 64(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, (%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 208(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 144(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 80(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 224(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 240(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 192(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 208(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, 160(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 176(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 128(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 144(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 96(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 112(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 64(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 80(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 32(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 48(%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movaps %xmm8, (%rdx)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: movaps %xmm14, 16(%rdx)
+; SSE-NEXT: movaps %xmm0, 240(%rcx)
+; SSE-NEXT: movaps %xmm12, 224(%rcx)
+; SSE-NEXT: movaps %xmm1, 208(%rcx)
+; SSE-NEXT: movaps %xmm2, 192(%rcx)
+; SSE-NEXT: movaps %xmm3, 176(%rcx)
+; SSE-NEXT: movaps %xmm4, 160(%rcx)
+; SSE-NEXT: movaps %xmm5, 144(%rcx)
+; SSE-NEXT: movaps %xmm6, 128(%rcx)
+; SSE-NEXT: movaps %xmm10, 112(%rcx)
+; SSE-NEXT: movaps %xmm15, 96(%rcx)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -2209,10 +2199,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
-; SSE-NEXT: movaps %xmm2, (%rcx)
+; SSE-NEXT: movaps %xmm13, (%rcx)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 240(%r8)
-; SSE-NEXT: movaps %xmm9, 224(%r8)
+; SSE-NEXT: movaps %xmm7, 224(%r8)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 208(%r8)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -2223,11 +2213,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movaps %xmm0, 160(%r8)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 144(%r8)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 128(%r8)
+; SSE-NEXT: movaps %xmm11, 128(%r8)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 112(%r8)
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 96(%r8)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 80(%r8)
@@ -2239,41 +2228,43 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movaps %xmm0, 32(%r8)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 16(%r8)
-; SSE-NEXT: movaps %xmm15, (%r8)
+; SSE-NEXT: movaps %xmm9, (%r8)
 ; SSE-NEXT: addq $1224, %rsp # imm = 0x4C8
 ; SSE-NEXT: retq
 ;
 ; AVX1-ONLY-LABEL: load_i32_stride4_vf64:
 ; AVX1-ONLY: # %bb.0:
-; AVX1-ONLY-NEXT: subq $2200, %rsp # imm = 0x898
+; AVX1-ONLY-NEXT: subq $2184, %rsp # imm = 0x888
 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3
 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4
-; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9
+; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14
 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1
-; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1]
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm14[0],ymm1[2],ymm14[2]
-; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8
-; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3,0,1]
-; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[4],ymm9[4],ymm5[5],ymm9[5]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
+; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm8
+; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm14[2,3,0,1]
+; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm14[0],ymm5[1],ymm14[1],ymm5[4],ymm14[4],ymm5[5],ymm14[5]
 ; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm7
 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm11
-; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm2[0]
-; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10
-; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm5
-; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6
-; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm15
-; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm5
+; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm13
+; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm2
+; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm12
+; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
+; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5
+; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1]
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2295,11 +2286,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2
 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
@@ -2319,11 +2310,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2
 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
@@ -2391,11 +2382,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm2
 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
@@ -2409,8 +2400,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm3
 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,0]
+; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,0]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0
@@ -2419,11 +2410,11 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,0],ymm13[4,5],ymm12[6,4]
+; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2
 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3
@@ -2433,40 +2424,40 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,0]
+; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm2
+; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm10
 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm8[0],ymm14[1],ymm8[1],ymm14[4],ymm8[4],ymm14[5],ymm8[5]
+; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm2
 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm7[1,0],ymm9[5,4],ymm7[5,4]
+; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
+; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm7[1,0],ymm14[5,4],ymm7[5,4]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
-; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm7
-; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[1],xmm5[1],zero,zero
-; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps %xmm10, %xmm3
-; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero
+; AVX1-ONLY-NEXT: vmovaps %xmm13, %xmm7
+; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm13[0],xmm3[1],xmm13[1]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5]
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5]
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm5[1,0],ymm10[5,4],ymm5[5,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm5[1,0],ymm9[5,4],ymm5[5,4]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm15[1],zero,zero
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm13[1],zero,zero
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2479,10 +2470,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2495,10 +2486,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2511,10 +2502,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2527,10 +2518,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2543,10 +2534,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2559,61 +2550,61 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm14[1],ymm2[3],ymm14[3]
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm9[2],ymm14[3],ymm9[3],ymm14[6],ymm9[6],ymm14[7],ymm9[7]
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3]
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm1 = xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm3[2],xmm11[2]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm7[2]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
-; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7]
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3]
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm6[2],xmm12[2]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm11[2]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm10[1],ymm4[3],ymm10[3]
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7]
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm2[2],xmm3[2]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[6],ymm11[6],ymm7[7],ymm11[7]
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2626,10 +2617,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2642,10 +2633,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2658,10 +2649,10 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -2674,30 +2665,28 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm14[3,0],ymm1[7,4],ymm14[7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[3,0],ymm1[7,4],ymm10[7,4]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm13[3,0],mem[3,0]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,0],xmm1[2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm15 = xmm10[3,0],mem[3,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7]
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
@@ -2705,29 +2694,30 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm13 = xmm13[3,0],mem[3,0]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,0],xmm1[2,3]
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[3,0],xmm14[3,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm11[3,0],ymm5[7,4],ymm11[7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[3,0],xmm13[3,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[3,0],ymm5[3,0],ymm9[7,4],ymm5[7,4]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,0],ymm1[2,3],ymm13[6,4],ymm1[6,7]
-; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[3,0],xmm15[3,0]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,0],xmm13[2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[6],ymm6[6],ymm8[7],ymm6[7]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm7[3,0],ymm11[7,4],ymm7[7,4]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[2,0],ymm10[2,3],ymm11[6,4],ymm10[6,7]
+; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT: # ymm12 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm1[2,3],ymm12[6,4],ymm1[6,7]
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3]
+; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3]
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm13 = xmm2[3,0],mem[3,0]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[2,0],xmm11[2,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[2,0],xmm12[2,3]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
 ; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
@@ -2739,9 +2729,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3]
 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload
-; AVX1-ONLY-NEXT: # xmm11 = xmm2[3,0],mem[3,0]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[2,0],xmm9[2,3]
+; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload
+; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,0],mem[3,0]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,0],xmm9[2,3]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
@@ -2840,461 +2830,451 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8)
 ; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8)
 ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8)
-; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r8)
-; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8)
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8)
-; AVX1-ONLY-NEXT: addq $2200, %rsp # imm = 0x898
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8)
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8)
+; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%r8)
+; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8)
+; AVX1-ONLY-NEXT: addq $2184, %rsp # imm = 0x888
 ; AVX1-ONLY-NEXT: vzeroupper
 ; AVX1-ONLY-NEXT: retq
 ;
 ; AVX2-ONLY-LABEL: load_i32_stride4_vf64:
 ; AVX2-ONLY: # %bb.0:
-; AVX2-ONLY-NEXT: subq $1960, %rsp # imm = 0x7A8
+; AVX2-ONLY-NEXT: subq $1992, %rsp # imm = 0x7C8
 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8
 ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3
+; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14
+; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15
+; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3
+; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm10
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4]
+; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm1
+; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm13
 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm4
-; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9
-; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4]
-; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm5
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1
-; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,4,0,4]
-; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
-; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4]
+; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm1
+; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4
+; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm4
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
+; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm12
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm9
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1
+; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3
-; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm4
+; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1
-; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm14
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm11
+; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3
-; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm4
+; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm3
-; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1
+; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8
-; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm4
+; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm11
-; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1
+; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm8
-; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm4
+; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm13
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1
+; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm8
-; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm4
+; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2
+; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4
 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm15
+; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm8
 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1
-; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm12
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1
+; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1
 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1
-; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2
+; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm1
+; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2
 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm6
-; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,5,1,5,1,5,1,5]
-; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm2
+; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,5,1,5,1,5,1,5]
+; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0
 ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm6, %ymm1
+; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5
-; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm7
-; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
 ; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
-; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm10
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm6, %ymm1
+; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4
 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm5
-; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm6
+; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm10
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm1
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm4
 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm9
-; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm6
+; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
-; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm10
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
+; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0
-; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm6, %ymm1
+; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-ONLY-NEXT: vpermps %ymm14,
%ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm15 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm12 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovaps 
816(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 
32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,7,3,7] ; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = 
xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; 
AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: 
vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) @@ -3332,10 +3312,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3345,15 +3323,14 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: addq $1960, %rsp # imm = 0x7A8 +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8) +; AVX2-ONLY-NEXT: addq $1992, %rsp # imm = 0x7C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index 853ea0fb70b0b..f658848e511c9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -588,212 +588,206 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i32_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $296, %rsp # imm = 0x128 -; SSE-NEXT: movdqa 288(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm4 -; SSE-NEXT: movdqa 240(%rdi), %xmm9 -; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: subq $312, %rsp # imm = 0x138 +; 
SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm3 +; SSE-NEXT: movdqa 240(%rdi), %xmm14 +; SSE-NEXT: movdqa 256(%rdi), %xmm8 ; SSE-NEXT: movdqa (%rdi), %xmm11 ; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm8 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 ; SSE-NEXT: movdqa 160(%rdi), %xmm10 -; SSE-NEXT: movdqa 176(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm13 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa 96(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; 
SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm15[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,2,2,2] +; SSE-NEXT: 
punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movaps %xmm11, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -808,126 +802,123 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movapd %xmm13, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movapd %xmm9, 16(%rcx) +; SSE-NEXT: movapd %xmm12, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movapd %xmm0, 16(%r8) +; SSE-NEXT: movapd %xmm1, 16(%r8) ; SSE-NEXT: movapd %xmm2, 48(%r8) ; SSE-NEXT: movapd %xmm6, (%r8) -; SSE-NEXT: movapd %xmm7, 32(%r8) +; SSE-NEXT: movapd %xmm8, 32(%r8) ; SSE-NEXT: movapd %xmm14, 16(%r9) ; SSE-NEXT: movapd %xmm15, 48(%r9) -; SSE-NEXT: movapd %xmm12, (%r9) -; SSE-NEXT: movapd %xmm1, 32(%r9) -; SSE-NEXT: addq $296, %rsp # imm = 0x128 +; SSE-NEXT: movapd %xmm13, (%r9) +; SSE-NEXT: movapd %xmm0, 32(%r9) +; SSE-NEXT: addq $312, %rsp # imm = 0x138 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $168, %rsp +; AVX1-ONLY-NEXT: subq $136, %rsp ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm14 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1],ymm5[1,3],ymm3[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[3,0],ymm3[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm5[1,3],ymm4[6,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 
= ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[3,0],ymm1[6,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[3,0],ymm1[6,4],ymm4[7,4] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,0],ymm5[2,0],ymm14[7,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,1],ymm1[6,4],ymm5[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[3,0],ymm2[2,0],ymm0[7,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm2[2,0],ymm3[7,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,1],ymm0[6,4],ymm2[6,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm7[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm11[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm5[2,2],ymm14[6,4],ymm5[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm5[2,2],ymm13[6,4],ymm5[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm2[3,0],ymm11[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] @@ -936,17 +927,17 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm12[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] @@ -956,18 +947,18 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm14, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: addq $168, %rsp +; AVX1-ONLY-NEXT: addq $136, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -975,99 +966,99 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,5,2,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm7, 
%ymm10 +; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm11 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4],ymm12[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <1,6,3,u> -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,2,7,0,5,2,7,0] -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4],ymm10[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <1,6,3,u> +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,2,7,0,5,2,7,0] +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm12[7] +; AVX2-ONLY-NEXT: vmovdqu 
%ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm14, %ymm7 -; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,7,4,u> +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm7, %ymm13 +; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3,4,5,6],ymm14[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm14[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,5,0,5,0,5,0,5] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm14, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm4[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: 
vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [1,6,1,6,1,6,1,6] -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm10 = [1,6,1,6,1,6,1,6] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm10, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm6[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm15, %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <4,1,6,u> -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <4,1,6,u> +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [2,7,2,7,2,7,2,7] ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm6, %ymm3 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm2[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm8, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm8, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1077,10 +1068,10 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r8) +; 
AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-ONLY-NEXT: popq %rax @@ -1226,66 +1217,63 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 448(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm4 -; SSE-NEXT: movdqa 400(%rdi), %xmm11 +; SSE-NEXT: movdqa 400(%rdi), %xmm10 ; SSE-NEXT: movdqa 416(%rdi), %xmm14 -; SSE-NEXT: movdqa 128(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm12 ; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm7 -; SSE-NEXT: movdqa 336(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1303,25 +1291,26 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa 560(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 592(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm6 +; SSE-NEXT: movdqa 160(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1345,284 +1334,292 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 144(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = 
mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 544(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movsd 
{{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: 
pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm13[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 
+; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: pshufd $250, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = 
mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1630,112 +1627,109 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: punpckhdq (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: 
movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 16(%rcx) -; SSE-NEXT: movapd %xmm6, 112(%r8) -; SSE-NEXT: movapd %xmm8, 96(%r8) +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 16(%rcx) +; SSE-NEXT: movapd %xmm7, 112(%r8) +; SSE-NEXT: movapd %xmm9, 96(%r8) ; SSE-NEXT: movapd %xmm10, 80(%r8) -; SSE-NEXT: movapd %xmm11, 64(%r8) -; SSE-NEXT: movapd %xmm15, 48(%r8) -; SSE-NEXT: movapd %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 
16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: movapd %xmm12, 64(%r8) +; SSE-NEXT: movapd %xmm13, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, (%r8) ; SSE-NEXT: movapd %xmm0, 112(%r9) ; SSE-NEXT: movapd %xmm1, 96(%r9) ; SSE-NEXT: movapd %xmm2, 80(%r9) ; SSE-NEXT: movapd %xmm3, 64(%r9) ; SSE-NEXT: movapd %xmm4, 48(%r9) ; SSE-NEXT: movapd %xmm5, 32(%r9) -; SSE-NEXT: movapd %xmm7, 16(%r9) -; SSE-NEXT: movapd %xmm9, (%r9) +; SSE-NEXT: movapd %xmm6, 16(%r9) +; SSE-NEXT: movapd %xmm8, (%r9) ; SSE-NEXT: addq $904, %rsp # imm = 0x388 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: subq $952, %rsp # imm = 0x3B8 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] @@ -1744,82 +1738,84 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm6 +; 
AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm15 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] @@ -1829,26 +1825,26 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm11[2,3],ymm5[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1861,273 +1857,268 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm7[2,0],ymm13[7,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,1],ymm0[6,4],ymm7[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm15[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm8[2,0],ymm13[7,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm7[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 
288(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm10[0,0],ymm13[5,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm3[0,0],ymm5[5,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm6[2,0],ymm14[7,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,1],ymm0[6,4],ymm6[6,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm14[2,0],ymm12[7,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[2,1],ymm0[6,4],ymm14[6,5] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm3[0,0],ymm14[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm2[0,0],ymm7[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm11[2,0],ymm0[7,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,1],ymm0[6,4],ymm11[6,5] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 -; AVX1-ONLY-NEXT: 
vperm2f128 {{.*#+}} ymm6 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm6[0,0],ymm11[5,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm13[0,0],ymm9[5,4],ymm13[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm12[2,0],ymm0[7,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm12[2,1],ymm0[6,4],ymm12[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,1],ymm0[6,4],ymm10[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm5[0,0],ymm8[5,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2],xmm6[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm7[3,0],ymm1[4,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm8[3,0],ymm4[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm10[1,0],ymm13[6,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,0],ymm3[1,0],ymm5[6,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm9[3,0],ymm2[4,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm9[2,2],ymm15[6,4],ymm9[6,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm14[3,0],ymm3[4,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm14[2,2],ymm15[6,4],ymm14[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,0],ymm3[1,0],ymm14[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,0],ymm2[1,0],ymm7[6,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm4[3,0],ymm3[4,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm4[2,2],ymm15[6,4],ymm4[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm11[3,0],ymm2[4,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm11[2,2],ymm15[6,4],ymm11[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm6[1,0],ymm11[6,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm13[1,0],ymm9[6,4],ymm13[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm12[3,0],ymm4[4,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = 
ymm15[2,0],ymm12[2,2],ymm15[6,4],ymm12[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,0],ymm5[1,0],ymm8[6,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],mem[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm2[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: 
vmovaps %ymm6, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%r9) -; AVX1-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: addq $952, %rsp # imm = 0x3B8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride5_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm11[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm12 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 
608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [5,2,7,0,5,2,7,0] ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 @@ -2135,173 +2126,174 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm4[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm11 ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm15, %ymm7 -; AVX2-ONLY-NEXT: 
vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm8 ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm14, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm13 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 
32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm9 = [1,6,1,6,1,6,1,6] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm9, %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,6,1,6,1,6,1,6] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm5, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm15[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm9, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm5[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm9, %ymm10 -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm15 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm5, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm8[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm13[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} 
ymm9 = ymm9[0,1,2,3,4],ymm13[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, (%rsp), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <4,1,6,u> ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm9 = [2,7,2,7,2,7,2,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm2[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm12[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm9, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm13[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm10, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded 
Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm8[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm7[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) @@ -2566,33 +2558,33 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 736(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm6 +; SSE-NEXT: movdqa 448(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa 432(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 400(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] @@ -2602,11 +2594,12 @@ define void 
@load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1040(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1040(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1056(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 1088(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2616,18 +2609,16 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm12 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2673,32 +2664,32 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm6 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm15 +; SSE-NEXT: movdqa 560(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 592(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2718,7 +2709,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1200(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] @@ -2732,41 +2723,43 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm14 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm8 -; SSE-NEXT: movdqa 496(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm9 +; 
SSE-NEXT: movdqa 496(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 528(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 512(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm13 +; SSE-NEXT: movdqa 800(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 816(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 848(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2777,17 +2770,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1120(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1136(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1168(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1168(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1152(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2798,45 +2791,43 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2845,15 +2836,15 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 464(%rdi), 
%xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2863,49 +2854,75 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 544(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = 
xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 784(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 704(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 944(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -2913,10 +2930,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 944(%rdi), %xmm1 +; SSE-NEXT: movdqa 1104(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -2924,11 +2941,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm1 +; SSE-NEXT: movdqa 1024(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -2936,12 +2954,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1104(%rdi), %xmm1 +; SSE-NEXT: movdqa 1264(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -2949,25 +2967,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1024(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1264(%rdi), %xmm1 +; SSE-NEXT: movdqa 1184(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2975,27 +2980,23 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1184(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; 
SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3004,37 +3005,30 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] @@ -3065,20 +3059,22 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -3093,106 +3089,123 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -3201,8 +3214,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -3211,20 +3224,29 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] @@ -3236,107 +3258,77 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # 
xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm13 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; 
SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -3344,8 +3336,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -3353,8 +3344,7 @@ define void 
@load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload @@ -3362,7 +3352,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload @@ -3370,8 +3360,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload @@ -3383,12 +3372,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -3405,15 +3394,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 
# 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm15[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -3529,7 +3517,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, (%rcx) ; SSE-NEXT: movapd %xmm13, 240(%r8) -; SSE-NEXT: movaps (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 208(%r8) @@ -3575,7 +3563,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm14, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: addq $1928, %rsp # imm = 0x788 ; SSE-NEXT: retq @@ -3583,322 +3571,321 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-LABEL: load_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $2488, %rsp # imm = 0x9B8 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 
-; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm6[1,3],ymm2[6,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm5[1,3],ymm2[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = 
ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm8[1,3],ymm2[6,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm9[1,3],ymm2[6,5],ymm9[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm7[1,3],ymm2[6,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = 
ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm14[1,3],ymm2[6,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm4[1,3],ymm0[6,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm1[1,3],ymm0[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm10[1,3],ymm2[6,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm11[1,3],ymm2[6,5],ymm11[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[2,0],ymm0[7,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,1],ymm0[6,4],ymm4[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] @@ -3911,10 +3898,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm9[2,0],ymm0[7,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,1],ymm0[6,4],ymm9[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -3930,10 +3917,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm0 = ymm0[2,0],ymm13[2,1],ymm0[6,4],ymm13[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] @@ -3943,18 +3930,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm6[0,0],ymm1[5,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,1],ymm0[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[2,0],ymm0[7,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] @@ -3969,24 +3957,24 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm14[2,0],ymm0[7,4],ymm14[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,1],ymm1[6,4],ymm14[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm0[0,0],ymm11[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm0[0,0],ymm12[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm10[2,1],ymm2[6,4],ymm10[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm7[2,1],ymm2[6,4],ymm7[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] @@ -3994,27 +3982,26 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[1,0],ymm0[0,0],ymm14[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[1,0],ymm0[0,0],ymm13[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm13[2,1],ymm3[6,4],ymm13[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[2,1],ymm3[6,4],ymm10[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2],xmm12[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm12 = xmm12[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,0],ymm1[0,0],ymm12[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm11 = xmm11[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,0],ymm1[0,0],ymm11[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] @@ -4029,9 +4016,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm13[2,0],mem[1,0],ymm13[6,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4039,8 +4026,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0],ymm1[3,0],ymm8[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] @@ -4053,9 +4040,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm9[3,0],ymm15[4,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm9[2,2],ymm15[6,4],ymm9[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4074,8 +4062,7 @@ define 
void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm6[1,0],ymm1[6,4],ymm6[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4083,11 +4070,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm3[1,0],ymm12[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm3[1,0],ymm11[6,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4095,26 +4082,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm10[3,0],ymm9[4,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm7[3,0],ymm9[4,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,0],ymm10[1,0],ymm14[6,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm10[1,0],ymm13[6,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm8 
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm6[3,0],ymm3[4,4],ymm6[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm6[2,2],ymm15[6,4],ymm6[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,0],ymm8[3,0],ymm6[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm6[1,0],ymm11[6,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm3[1,0],ymm12[6,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4122,22 +4110,23 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm7[3,0],ymm1[4,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm8[3,0],ymm1[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm2[1,0],ymm4[6,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4149,26 +4138,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm12[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3,4],mem[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] @@ -4176,16 +4166,15 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 16-byte Folded Reload @@ -4206,13 +4195,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3,4],mem[5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm11[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 16-byte Folded Reload @@ -4309,9 +4298,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqu 
%ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4321,17 +4310,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] @@ -4340,44 +4329,44 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2,3],ymm15[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm1 
; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm12[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4405,11 +4394,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4421,28 +4410,29 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm4 -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd $51, (%rsp), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,2,7,0,5,2,7,0] ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 @@ -4450,75 +4440,77 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm7[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] +; AVX2-ONLY-NEXT: 
vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 944(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 1264(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm15[2,3],ymm4[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: 
vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm10[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 784(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -4529,12 +4521,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} 
xmm3 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 @@ -4543,10 +4533,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm7[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 @@ -4554,12 +4545,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 @@ -4567,12 +4557,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: 
vinserti128 $1, 1216(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm9 ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 @@ -4581,8 +4572,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] @@ -4593,12 +4584,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm5[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm4[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 @@ -4606,11 +4596,13 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm14 ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5,6],ymm15[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm1 @@ -4618,9 
+4610,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vinserti128 $1, 1056(%rdi), %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4633,8 +4625,9 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload @@ -4645,9 +4638,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload @@ -4661,17 +4653,17 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = 
mem[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload @@ -4680,73 +4672,73 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm2[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm5[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm5[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,1,6,u> ; AVX2-ONLY-NEXT: vpermd 
%ymm5, %ymm8, %ymm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm6, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm6[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm11[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm9[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm7, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -4759,14 +4751,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm3[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm3[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm10[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm8, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload @@ -4780,19 +4772,19 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm11[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm12[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload @@ -4801,14 +4793,14 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm8, 
64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -4817,60 +4809,60 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 
32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm4, 160(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-ONLY-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX2-ONLY-NEXT: vzeroupper @@ -4879,201 +4871,203 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i32_stride5_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm15 
-; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm6, %zmm24 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,5,10,15,20,25,30,u> -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm19, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm6, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <17,22,27,0,5,10,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm18, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm16, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm26 = <2,7,12,17,22,27,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm26, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm22, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm29, %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm21, %zmm31, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm29, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm29, %zmm17 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <3,8,13,18,23,28,u,u> -; AVX512F-NEXT: vpermt2d %zmm10, %zmm31, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm11, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <4,9,14,19,24,29,u,u> -; AVX512F-NEXT: vpermt2d %zmm13, %zmm31, %zmm20 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm18, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm26, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm11, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm13 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm14, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm20, 
%zmm1, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm28, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm28, %zmm26 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> +; AVX512F-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm15, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm12, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm25, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm9, %zmm17 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm2 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm26, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm11, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm9, %zmm11 ; AVX512F-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 
# 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm15, %zmm7 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm31 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm19 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm15, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm15, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm12 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm28 -; 
AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm28 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm20 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm17 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rsi) +; AVX512F-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm29, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm30, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5081,201 +5075,203 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride5_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm6, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,5,10,15,20,25,30,u> -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm19, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: 
vpermt2d %zmm17, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <17,22,27,0,5,10,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm18, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <2,7,12,17,22,27,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm26, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm22, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm29, %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm31, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm29, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm29, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,8,13,18,23,28,u,u> -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm11, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <4,9,14,19,24,29,u,u> -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm31, %zmm20 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm26, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm11, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm14, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm28, %zmm26 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm12, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 +; 
AVX512BW-NEXT: vpermt2d %zmm11, %zmm25, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm9, %zmm17 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm26, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm11, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm9, %zmm11 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm31 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm15, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm12 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 +; AVX512BW-NEXT: 
vpermt2d %zmm0, %zmm4, %zmm28 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm30, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index d179de0a039d3..2d8ada07be825 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -19,29 +19,29 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i32_stride6_vf2: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movq %xmm0, (%rsi) +; SSE-NEXT: movq %xmm1, (%rsi) ; SSE-NEXT: movq %xmm4, (%rdx) ; SSE-NEXT: movq %xmm5, (%rcx) ; SSE-NEXT: movq %xmm6, (%r8) -; SSE-NEXT: movq %xmm1, (%r9) +; SSE-NEXT: movq %xmm0, (%r9) ; SSE-NEXT: movq %xmm7, (%rax) ; SSE-NEXT: retq ; @@ -255,47 +255,48 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm10 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm9, (%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm4, (%rsi) ; SSE-NEXT: movapd %xmm3, (%rdx) ; SSE-NEXT: movapd %xmm5, (%rcx) -; SSE-NEXT: movapd %xmm10, (%r8) -; SSE-NEXT: movapd %xmm8, (%r9) +; SSE-NEXT: movapd %xmm6, (%r8) +; SSE-NEXT: movapd %xmm9, (%r9) ; SSE-NEXT: movapd %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -433,124 +434,122 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: movdqa 144(%rdi), %xmm4 ; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm5 
+; SSE-NEXT: movdqa 96(%rdi), %xmm6 ; SSE-NEXT: movdqa 112(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm7 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm14[0],xmm12[1] +; SSE-NEXT: movdqa 80(%rdi), %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] -; SSE-NEXT: movapd %xmm10, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movapd %xmm12, 16(%rdx) ; SSE-NEXT: movapd %xmm11, (%rdx) ; SSE-NEXT: movapd %xmm13, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm4, 16(%r8) -; SSE-NEXT: movapd %xmm7, (%r8) -; SSE-NEXT: movapd %xmm0, 16(%r9) -; SSE-NEXT: movapd %xmm5, (%r9) +; SSE-NEXT: movapd %xmm8, (%r8) +; SSE-NEXT: movapd %xmm10, 16(%r9) +; SSE-NEXT: movapd %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm9, 16(%rax) -; SSE-NEXT: movapd %xmm6, (%rax) +; SSE-NEXT: movapd %xmm2, 16(%rax) +; SSE-NEXT: movapd %xmm9, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf8: @@ -953,511 +952,516 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $360, %rsp # imm = 0x168 -; SSE-NEXT: movdqa 240(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: movdqa 192(%rdi), %xmm13 -; SSE-NEXT: movdqa 208(%rdi), %xmm14 -; SSE-NEXT: movdqa 336(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm7 -; SSE-NEXT: movdqa 288(%rdi), %xmm12 -; SSE-NEXT: movdqa 304(%rdi), %xmm8 -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: subq $408, %rsp # imm = 0x198 +; SSE-NEXT: movdqa 240(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm3 +; SSE-NEXT: movdqa 192(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: movdqa 336(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm5 +; SSE-NEXT: movdqa 288(%rdi), %xmm15 +; SSE-NEXT: movdqa 304(%rdi), %xmm7 +; SSE-NEXT: movdqa 64(%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm15 -; SSE-NEXT: movdqa 160(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 144(%rdi), %xmm9 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: movdqa 272(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: movdqa 224(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movapd %xmm14, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movapd %xmm0, 16(%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 
(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movapd %xmm15, 16(%r8) +; SSE-NEXT: movapd %xmm12, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movapd %xmm2, 16(%r9) ; SSE-NEXT: movapd %xmm3, 32(%r9) ; SSE-NEXT: movapd %xmm4, 48(%r9) -; SSE-NEXT: movapd %xmm15, (%r9) +; SSE-NEXT: movapd %xmm5, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm11, 16(%rax) +; SSE-NEXT: movapd %xmm13, 16(%rax) ; SSE-NEXT: movapd %xmm9, 32(%rax) -; SSE-NEXT: movapd %xmm12, 48(%rax) +; SSE-NEXT: movapd %xmm8, 48(%rax) ; SSE-NEXT: movapd %xmm10, (%rax) -; SSE-NEXT: addq $360, %rsp # imm = 0x168 +; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,0],ymm6[0,0],ymm2[6,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[2,2],ymm4[6,4],ymm6[6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm2[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm7[0],ymm14[0],ymm7[3],ymm14[2] +; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,0],ymm2[0,0],ymm3[6,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm2[2,2],ymm5[6,4],ymm2[6,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm7[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm8[0],ymm13[0],ymm8[3],ymm13[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm1[2,2],ymm4[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[2,0],ymm1[0,0],ymm10[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm1[2,2],ymm5[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm5[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[0,1] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm12[0],ymm5[3],ymm12[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[0,1] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm4[0],ymm12[0],ymm4[3],ymm12[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[3,0],ymm6[1,0],ymm13[7,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vmovups 
%ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,0],ymm6[2,3],ymm11[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,0],xmm2[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,2],xmm2[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,1],ymm14[1,3],ymm7[7,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm2[2,3],ymm11[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm13[1,3],ymm8[7,5],ymm13[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,0],ymm1[1,0],ymm10[7,4],ymm1[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,0],xmm0[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1],ymm12[1,3],ymm5[7,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1],ymm12[1,3],ymm4[7,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[2,1],ymm8[2,0],ymm13[6,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm14[0,1],ymm10[2],ymm14[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0],ymm5[2,0],ymm7[4,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm7[2,0],ymm9[6,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, 
%xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0],ymm4[2,0],ymm8[4,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,1],ymm6[2,0],ymm4[6,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,1],ymm3[2,0],ymm10[6,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,0],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,0],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,0],ymm4[2,0],ymm0[4,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[3,1],ymm7[4,5],ymm5[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],xmm3[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,1],ymm8[2,1],ymm9[7,5],ymm8[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm14[2,0],ymm0[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm4[3,1],ymm0[4,5],ymm4[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[3,1],xmm15[3,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,1],ymm6[2,1],ymm13[7,5],ymm6[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[3,1],ymm8[4,5],ymm4[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm5[3,3] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm1 +; 
AVX1-ONLY-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm1[2,1],ymm7[7,5],ymm1[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm14[3,1],ymm0[4,5],ymm14[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[3,1],xmm15[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm7[1],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm8[0,0],ymm0[6,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2],ymm9[2,0],ymm8[4,6],ymm9[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0],xmm12[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm2[1],ymm6[0],ymm2[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,0],ymm13[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm12[0,0],ymm9[6,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,2],ymm15[2,0],ymm12[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[1,0],ymm0[7,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,3],ymm0[2,0],ymm8[4,7],ymm0[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps $215, (%rsp), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7] -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1],ymm5[2,0],ymm14[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm12[1,0],ymm9[7,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,3],ymm4[2,0],ymm12[4,7],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm3[1,3],ymm2[7,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm2[2,0],ymm13[5,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm6[1],ymm1[0],ymm6[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,0],ymm7[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[2,0],ymm8[0,0],ymm9[6,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,2],ymm11[2,0],ymm8[4,6],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,0],ymm4[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,0],ymm14[0,0],ymm11[6,4],ymm14[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,0],ymm8[1,0],ymm9[7,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,3],ymm7[2,0],ymm8[4,7],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1],ymm5[2,0],ymm13[5,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,0],ymm14[1,0],ymm11[7,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,3],ymm5[2,0],ymm14[4,7],ymm5[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[2,0],ymm10[5,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX1-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride6_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $200, %rsp -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $232, %rsp +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 -; 
AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[0,1],ymm7[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm1, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[0,1],ymm9[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm14 @@ -1467,69 +1471,70 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] +; 
AVX2-SLOW-NEXT: vpermps %ymm13, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1538,78 +1543,79 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = 
ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm5 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [4,2,4,2] ; AVX2-SLOW-NEXT: # xmm4 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
ymm12 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm8[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm2 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-SLOW-NEXT: vmovaps %ymm13, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rax) -; AVX2-SLOW-NEXT: addq $200, %rsp +; AVX2-SLOW-NEXT: addq $232, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq 
; ; AVX2-FAST-LABEL: load_i32_stride6_vf16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $200, %rsp -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 @@ -1617,25 +1623,25 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm8[0,1],ymm7[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[0,1],ymm9[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm14 @@ -1645,41 +1651,41 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-NEXT: 
vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] @@ -1690,23 +1696,24 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1715,56 +1722,56 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] ; AVX2-FAST-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; 
AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm14 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm9, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] -; AVX2-FAST-NEXT: # xmm4 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1],ymm4[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm2 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm6, 
%ymm9, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload @@ -1782,37 +1789,37 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $232, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm12[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[0,1],ymm7[0,1] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm1, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[0,1],ymm9[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm14 @@ -1822,69 +1829,70 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm5 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 
(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1893,68 +1901,69 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm10, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [4,2,4,2] ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = 
mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm8[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm2 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte 
Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2152,478 +2161,477 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i32_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: subq $1032, %rsp # imm = 0x408 +; SSE-NEXT: movdqa 64(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 528(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa 528(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm4 +; SSE-NEXT: movdqa 544(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 480(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm15 -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm12 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: movdqa 496(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,0,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 384(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 400(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 432(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} 
xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 304(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm3 +; SSE-NEXT: movdqa 336(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 688(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 688(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 720(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa 720(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: movdqa 736(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm1 -; SSE-NEXT: movdqa 592(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] -; 
SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm7 +; SSE-NEXT: movdqa 592(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; 
SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 560(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: movdqa 224(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa 512(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa 464(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa 464(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa 416(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa 752(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 656(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm8 -; SSE-NEXT: punpckldq 
{{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa 752(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa 704(%rdi), %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa 608(%rdi), %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm9 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movapd %xmm15, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2632,6 +2640,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, %xmm11 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2641,7 +2650,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; 
SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm14, %xmm12 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2649,20 +2658,20 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, %xmm10 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2670,21 +2679,22 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; 
SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -2743,9 +2753,9 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) @@ -2758,32 +2768,31 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, 112(%r9) ; SSE-NEXT: movapd %xmm3, 96(%r9) ; SSE-NEXT: movapd %xmm4, 80(%r9) -; SSE-NEXT: movapd %xmm5, 64(%r9) -; SSE-NEXT: movapd %xmm6, 48(%r9) -; SSE-NEXT: movapd %xmm13, 32(%r9) -; SSE-NEXT: movapd %xmm9, 16(%r9) +; SSE-NEXT: movapd %xmm6, 64(%r9) +; SSE-NEXT: movapd %xmm8, 48(%r9) +; SSE-NEXT: movapd %xmm9, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm14, 112(%rax) -; SSE-NEXT: movapd %xmm10, 96(%rax) -; SSE-NEXT: movapd %xmm7, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movapd %xmm13, 96(%rax) +; SSE-NEXT: movapd %xmm15, 80(%rax) +; SSE-NEXT: movapd %xmm10, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movapd %xmm15, 16(%rax) -; SSE-NEXT: movapd %xmm12, (%rax) -; SSE-NEXT: addq $1016, %rsp # imm = 0x3F8 +; SSE-NEXT: movapd %xmm12, 32(%rax) +; SSE-NEXT: movapd %xmm11, 16(%rax) +; SSE-NEXT: movapd %xmm7, (%rax) +; SSE-NEXT: addq $1032, %rsp # imm = 0x408 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1064, %rsp # imm = 0x428 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 @@ -2791,11 +2800,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 
128(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 @@ -2803,10 +2812,10 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm8[0,0],ymm1[6,4],ymm8[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,2],ymm0[6,4],ymm8[6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm2[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2814,15 +2823,15 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm5[0,0],ymm4[6,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,2],ymm0[6,4],ymm5[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm6[0,0],ymm4[6,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,2],ymm0[6,4],ymm6[6,6] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm9[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2845,18 +2854,18 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2866,54 +2875,54 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm7[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm4[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm12[0,1] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm12[0],ymm0[0],ymm12[3],ymm0[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm10[0,1] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,0],ymm8[1,0],ymm15[7,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm14[2,0],ymm8[2,3],ymm14[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],xmm6[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm11[0,2],xmm6[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[0,2],xmm7[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm11[1,3],ymm0[7,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[3,0],ymm5[1,0],ymm14[7,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,0],ymm5[2,3],ymm6[6,4],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,0],xmm4[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[0,2],xmm4[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,0],xmm5[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[0,2],xmm5[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm8[1,3],ymm0[7,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm14[1,3],ymm0[7,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,0],ymm3[1,0],ymm0[7,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,0],xmm2[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = 
ymm0[3,0],ymm3[1,0],ymm0[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,0],xmm2[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload @@ -2921,73 +2930,73 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm9[1,0],ymm5[7,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,0],ymm9[1,0],ymm11[7,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0],ymm9[2,3],ymm2[6,4],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,0],xmm1[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm1[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm13[1,3],ymm12[7,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1],ymm13[1,3],ymm10[7,5],ymm13[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm15[2,1],mem[2,0],ymm15[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm4[2,0],ymm0[4,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm3[2,0],ymm0[4,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,1],ymm11[2,0],ymm5[6,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm0[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = 
ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm9[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2],ymm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,1],ymm6[2,0],ymm1[6,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[2,1],ymm8[2,0],ymm5[6,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm14 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3] @@ -2996,45 +3005,45 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[3,1],ymm5[4,5],ymm4[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm3[3,1],ymm1[4,5],ymm3[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm4[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 
32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1],ymm13[2,1],ymm5[7,5],ymm13[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm10[3,1],ymm7[4,5],ymm10[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm13[2,1],ymm11[7,5],ymm13[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[3,1],xmm14[3,3] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,1],ymm6[2,1],ymm1[7,5],ymm6[6,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[3,1],ymm1[4,5],ymm2[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm6[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm6[2,1],ymm4[7,5],ymm6[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,1],xmm14[3,3] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm8[2,1],ymm5[7,5],ymm8[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm10[3,1],ymm7[4,5],ymm10[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm9[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm11[2,1],ymm2[7,5],ymm11[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm12[2,1],ymm2[7,5],ymm12[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -3042,7 +3051,6 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -3051,86 +3059,95 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm1[2,0],ymm5[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm10 +; AVX1-ONLY-NEXT: 
vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,0],ymm11[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm2[2,0],ymm1[4,6],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm13[0],ymm1[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,0],ymm10[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm0[0,0],ymm4[6,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm2[2,0],ymm0[4,6],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm15[0],ymm2[2],ymm15[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,0],ymm12[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm3[2,0],ymm1[4,6],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 
= mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm2[0,0],ymm5[6,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2],ymm3[2,0],ymm2[4,6],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0],xmm3[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm9[1],ymm3[0],ymm9[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[0,1],ymm15[2,0],ymm7[4,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,0],ymm15[0,0],ymm13[6,4],ymm15[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[0,2],ymm12[2,0],ymm15[4,6],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm0[1,0],ymm4[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,1],ymm4[2,0],ymm10[5,5],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,3],ymm0[2,0],ymm5[4,7],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2,0],ymm4[4,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,0],ymm0[0,0],ymm3[6,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm1[1,0],ymm6[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm6[2,0],ymm1[4,7],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps $12, (%rsp), %xmm15, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm7[1,0],ymm1[7,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,3],ymm1[2,0],ymm7[4,7],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm2[1,0],ymm5[7,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,3],ymm1[2,0],ymm2[4,7],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm13[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7] @@ -3138,28 +3155,16 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1],ymm5[2,0],ymm10[5,5],ymm5[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,0],ymm1[1,0],ymm6[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[2,0],ymm1[4,7],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: 
vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,1],mem[1,3],ymm2[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm2[2,0],ymm6[5,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,0],ymm15[1,0],ymm13[7,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,3],ymm1[2,0],ymm15[4,7],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,1],ymm3[1,3],ymm9[7,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1],ymm3[2,0],ymm7[5,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[2,0],ymm4[5,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3192,7 +3197,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3200,11 +3205,11 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX1-ONLY-NEXT: addq $1064, %rsp # imm = 0x428 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3212,247 +3217,248 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-SLOW-NEXT: 
vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = 
ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm4, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm12, %ymm15 +; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm0, %ymm15 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm9, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, (%rsp), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; 
AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = 
mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm14 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm15[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3461,161 +3467,165 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: 
vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = 
mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] -; AVX2-SLOW-NEXT: # xmm14 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = 
ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm11 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm4 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 
# 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] +; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm13, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm13, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm13, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8) +; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) ; AVX2-SLOW-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -3624,170 +3634,170 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm12, %ymm15 +; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [2,0,6,4,2,0,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload @@ -3796,72 +3806,73 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,2,3,4,4,6,7] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: 
vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm14 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm13[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm3[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3870,159 +3881,165 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-FAST-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] -; AVX2-FAST-NEXT: # xmm14 = mem[0,0] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; 
AVX2-FAST-NEXT: vpermps %ymm7, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm11 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: 
vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6]
+; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm2
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8)
+; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm1
+; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2
+; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm13
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm2
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm12
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vpermps %ymm11, %ymm13, %ymm2
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm2
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9)
+; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9)
-; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9)
-; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9)
+; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm1
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6]
+; AVX2-FAST-NEXT: vpermps %ymm2, %ymm13, %ymm4
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
+; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3]
+; AVX2-FAST-NEXT: # xmm4 = mem[0,0]
+; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7]
+; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX2-FAST-NEXT: vpermps %ymm14, %ymm13, %ymm14
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7]
+; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
+; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm7, %ymm13, %ymm5
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm6
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm15, %ymm13, %ymm6
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm2
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9)
+; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9)
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax)
-; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rax)
-; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax)
-; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax)
+; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax)
+; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rax)
+; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax)
+; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax)
; AVX2-FAST-NEXT: addq $1160, %rsp # imm = 0x488
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
@@ -4031,247 +4048,248 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: subq $1160, %rsp # imm = 0x488
; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6
-; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3
+; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm10
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm11
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm4
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u>
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm2, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u>
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm12, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm4, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm12, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm2, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm14, %ymm15
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm12, %ymm10
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm15
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm4, %ymm14
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm14, %ymm14
-; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm12, %ymm15
+; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm15
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm15
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u>
-; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm9
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm9, %ymm4
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm9, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
-; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3]
-; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermilps $0, (%rsp), %ymm3 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm2
+; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
-; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm15
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
-; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4]
+; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3]
-; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm13
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3]
-; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7]
-; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm14
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm15[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4280,161 +4298,165 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[0,1,2,3],mem[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
-; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
-; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm2, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2]
+; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,0]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm11, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6]
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm6
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm2, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm4
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm1, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm2, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5]
-; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3]
-; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[0,0]
-; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7]
-; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm3, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm3, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm3, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5]
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm3, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8)
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8)
+; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm13
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm12
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm13, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r9)
-; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9)
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9)
+; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9)
+; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3]
+; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0]
+; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7]
+; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm13, %ymm14
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm13, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm13, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm13, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r8)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r8)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r8)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r8)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r9)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r9)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r9)
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r9)
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rax)
-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax)
+-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax)
+-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax)
+-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rax)
+-; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax)
+; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax)
; AVX2-FAST-PERLANE-NEXT: addq $1160, %rsp # imm = 0x488
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
@@ -4796,11 +4818,12 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 1248(%rdi), %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 1248(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1264(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 1296(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4822,72 +4845,72 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movdqa 48(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 384(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 400(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 432(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 432(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 448(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 768(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 768(%rdi), %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 784(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 816(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 832(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1152(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1168(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 1200(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 1200(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1216(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 288(%rdi), %xmm7
-; SSE-NEXT: movdqa 304(%rdi), %xmm13
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3]
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm7, %xmm1
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 288(%rdi), %xmm2
+; SSE-NEXT: movdqa 304(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm15
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 336(%rdi), %xmm10
+; SSE-NEXT: movdqa 336(%rdi), %xmm7
; SSE-NEXT: movdqa 352(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1]
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1]
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 672(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 688(%rdi), %xmm0
@@ -4899,10 +4922,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: movdqa 736(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
-; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1056(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1072(%rdi), %xmm0
@@ -4910,507 +4933,462 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 1104(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 1104(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1120(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
-; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 1440(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 1440(%rdi), %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1456(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa 1488(%rdi), %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 1504(%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
-; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 192(%rdi), %xmm3
-; SSE-NEXT: movdqa 208(%rdi), %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 240(%rdi), %xmm2
-; SSE-NEXT: movdqa 256(%rdi), %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 192(%rdi), %xmm5
+; SSE-NEXT: movdqa 208(%rdi), %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3]
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm3
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movdqa 240(%rdi), %xmm2
+; SSE-NEXT: movdqa 256(%rdi), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,1,1]
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
-; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 576(%rdi), %xmm6
-; SSE-NEXT: movdqa 592(%rdi), %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3]
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm6, %xmm8
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
+; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1]
+; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 576(%rdi), %xmm10
+; SSE-NEXT: movdqa 592(%rdi), %xmm14
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3]
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm10, %xmm4
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE-NEXT: movdqa 624(%rdi), %xmm11
-; SSE-NEXT: movdqa 640(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,0,1,1]
+-; SSE-NEXT: movdqa 640(%rdi), %xmm0
+-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3]
+-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,0,1,1]
+; SSE-NEXT: movdqa 640(%rdi), %xmm13
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,3,3]
+; SSE-NEXT: movdqa
%xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm8[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 960(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE-NEXT: movdqa 1008(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 1008(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1024(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1360(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: movdqa 1392(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1408(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm15[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: 
punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm14[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1344(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1360(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm14[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 1392(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1408(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = 
xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm14[0],xmm7[1] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm14[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte 
Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded 
Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 
-; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm9 
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 272(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa 464(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] 
+; SSE-NEXT: movdqa 416(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa 224(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa 512(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa 464(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 416(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa 560(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa 512(%rdi), %xmm9 +; SSE-NEXT: movdqa 656(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 608(%rdi), %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 656(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 752(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa 704(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 752(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa 800(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 848(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa 800(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa 944(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 896(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movdqa 944(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 896(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] ; SSE-NEXT: movdqa 1040(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa 992(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 992(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: movdqa 1136(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 1088(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1088(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa 1232(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa 1184(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa 1232(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa 1184(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 1328(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa 1280(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 1424(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa 1376(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 1520(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa 1472(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, (%rsp), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[3,3,3,3] @@ -5419,43 +5397,34 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd $255, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -5472,29 +5441,39 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5510,29 +5489,59 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5540,11 +5549,23 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5554,17 +5575,19 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5574,7 +5597,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5584,20 +5608,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5605,17 +5629,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd 
$80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5625,71 +5649,68 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] @@ -5705,7 +5726,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 
16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm15, %xmm4 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5757,9 +5778,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5800,9 +5822,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5810,21 +5833,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5832,14 +5854,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5848,18 +5869,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] @@ -5994,11 +6017,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, 240(%r9) ; SSE-NEXT: movapd %xmm3, 224(%r9) ; SSE-NEXT: movapd %xmm5, 208(%r9) -; SSE-NEXT: movapd %xmm6, 192(%r9) -; SSE-NEXT: movapd %xmm7, 176(%r9) +; SSE-NEXT: movapd %xmm7, 192(%r9) +; SSE-NEXT: movapd %xmm13, 176(%r9) ; SSE-NEXT: movapd %xmm8, 160(%r9) ; SSE-NEXT: movapd %xmm9, 144(%r9) -; SSE-NEXT: movapd %xmm12, 128(%r9) +; SSE-NEXT: movapd %xmm11, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6017,7 +6040,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm14, 240(%rax) -; SSE-NEXT: movapd %xmm13, 224(%rax) +; SSE-NEXT: movapd %xmm12, 224(%rax) ; SSE-NEXT: movapd %xmm15, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) @@ -6043,14 +6066,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movapd %xmm4, (%rax) ; SSE-NEXT: addq $2184, %rsp # imm = 0x888 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2536, %rsp # imm = 0x9E8 +; AVX1-ONLY-NEXT: subq $2584, %rsp # imm = 0xA18 ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 @@ -6075,10 +6097,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6094,9 +6115,10 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm14[0,3] +; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6112,18 +6134,19 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm13[0,0],ymm1[6,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,2],ymm0[6,4],ymm13[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,2],ymm0[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm13[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6139,18 +6162,18 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm10[0,0],ymm1[6,4],ymm10[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,2],ymm0[6,4],ymm10[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm11[0,0],ymm1[6,4],ymm11[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,2],ymm0[6,4],ymm11[6,6] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, 
%ymm1, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6202,9 +6225,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6220,273 +6243,273 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm4[0,0],ymm1[6,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,2],ymm0[6,4],ymm4[6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[0,0],ymm1[6,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,2],ymm0[6,4],ymm3[6,6] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] ; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,2],ymm0[6,4],ymm2[6,6] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[0,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm1[0,0],ymm5[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,2],xmm5[0,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm15[0,1] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm12[0,1] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm9[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm9[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[0,2],xmm14[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[2,3],ymm14[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[0,2],xmm13[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1],ymm15[1,3],ymm0[7,5],ymm15[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm13[1,0],ymm14[7,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,3],ymm0[6,4],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[1,0],xmm11[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[0,2],xmm11[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[3,0],ymm11[1,0],ymm14[7,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,0],ymm11[2,3],ymm13[6,4],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,0],xmm10[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[0,2],xmm10[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm1[3,1],ymm13[1,3],ymm1[7,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,1],ymm13[1,3],ymm0[7,5],ymm13[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm10[1,0],ymm11[7,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,3],ymm0[6,4],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = 
xmm1[1,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm10[0,2],xmm9[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1],ymm10[1,3],ymm1[7,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,0],ymm8[1,0],ymm11[7,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[1,0],ymm0[7,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,3],ymm0[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[0,2],xmm7[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,2],xmm7[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm6[1,0],ymm0[7,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,3],ymm0[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,0],xmm5[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,2],xmm5[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,0],xmm4[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm4[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = 
ymm1[3,1],ymm7[1,3],ymm1[7,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[1,0],ymm0[7,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,3],ymm0[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,0],xmm3[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,2],xmm3[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm7[1,3],ymm0[7,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,0],ymm3[1,0],ymm6[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,0],xmm2[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,1],ymm4[1,3],ymm1[7,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm4[1,3],ymm0[7,5],ymm4[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm1[1,0],ymm3[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,0],xmm12[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm12[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm5[3,0] +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm0 = xmm2[0,2],xmm5[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm2[3,1],mem[1,3],ymm2[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm12[2,0],ymm1[4,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm9[2,1],mem[2,0],ymm9[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; 
AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = 
ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm8[2,0],ymm1[4,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm10[2,0],ymm1[4,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm6[2,0],ymm1[4,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 
$1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] @@ -6496,114 +6519,114 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded 
Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm6[2,1],mem[2,0],ymm6[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[0,0],ymm2[2,0],ymm13[4,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm6[2,0],ymm13[4,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm3[2,1],ymm5[2,0],ymm3[6,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[2,0],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,1],ymm3[2,0],ymm4[6,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[2,0],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm14[2,0],ymm1[4,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 
32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,1],mem[3,1],ymm10[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm11[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,1],mem[3,1],ymm10[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm11[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm15[2,0],ymm2[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm12[3,1],ymm0[4,5],ymm12[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm1[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[3,1],ymm14[2,1],ymm1[7,5],ymm14[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm1[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = 
ymm11[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,1],ymm8[3,1],ymm10[4,5],ymm8[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm0[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,1],mem[2,1],ymm0[7,5],mem[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm10[3,1],ymm0[4,5],ymm10[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm1[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[3,1],ymm8[4,5],ymm6[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[2,1],ymm10[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm14[3,1],ymm1[4,5],ymm14[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
xmm0 = xmm0[3,1],xmm3[3,3] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm5[2,1],ymm4[7,5],ymm5[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm2[3,1],ymm13[4,5],ymm2[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm7[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm15[3,1],ymm2[4,5],ymm15[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,1],xmm7[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm6[3,1],ymm13[4,5],ymm6[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm8[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm7[2,1],ymm3[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[3,1],ymm0[4,5],ymm9[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm15[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm5[2,1],ymm4[7,5],ymm5[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm8[2,1],ymm4[7,5],ymm8[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -6614,9 +6637,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,1],ymm8[2,1],ymm6[7,5],ymm8[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -6632,18 +6655,19 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm8[0,0],ymm2[6,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm1[2,0],ymm8[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm10[0,0],ymm2[6,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm1[2,0],ymm10[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6652,16 +6676,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,0],ymm1[6,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm1[2,0],ymm6[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6676,7 +6697,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] @@ -6691,7 +6712,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6716,7 +6737,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] @@ -6725,8 +6746,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6740,14 +6761,14 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,0],ymm2[0,0],ymm10[6,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm2[0,0],ymm14[6,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6756,223 +6777,223 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 1232(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm1[0,0],ymm9[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm7[2,0],ymm1[4,6],ymm7[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; 
AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm14[1],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,0],ymm13[4,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm9[1],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,1],ymm15[2,0],ymm9[4,5],ymm15[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[2,0],ymm0[0,0],ymm7[6,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,2],ymm13[2,0],ymm0[4,6],ymm13[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm8[1,0],ymm13[7,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm13[2,0],ymm8[4,7],ymm13[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,1],mem[1,3],ymm15[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[3,0],ymm10[1,0],ymm9[7,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,3],ymm13[2,0],ymm10[4,7],ymm13[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: 
# ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,3],ymm8[2,0],ymm6[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[1,3],ymm13[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,1],ymm13[2,0],ymm15[5,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm5[1,0],ymm8[7,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm8[2,0],ymm5[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[1,3],ymm13[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,1],ymm13[2,0],ymm15[5,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm8[2,0],ymm4[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[1,3],ymm12[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1],ymm12[2,0],ymm13[5,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm8[2,0],ymm3[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[1,3],ymm11[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,1],ymm11[2,0],ymm12[5,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,0],ymm2[1,0],ymm10[7,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm8[2,0],ymm2[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,1],ymm10[2,0],ymm11[5,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[3,0],ymm1[1,0],ymm9[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm8[2,0],ymm1[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm7[1,0],ymm10[7,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm13[2,0],ymm7[4,7],ymm13[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,1],ymm9[2,0],ymm10[5,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm0[1,0],ymm7[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm0 = ymm0[0,3],ymm7[2,0],ymm0[4,7],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm8[2,0],ymm9[5,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r9) +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[3,0],ymm5[1,0],ymm10[7,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm13[2,0],ymm5[4,7],ymm13[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = 
xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm12[2,0],ymm4[4,7],ymm12[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,1],ymm13[2,0],ymm10[5,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm11[2,0],ymm3[4,7],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm10[1,1],ymm12[2,0],ymm10[5,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[3,0],ymm2[1,0],ymm14[7,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm10[2,0],ymm2[4,7],ymm10[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[1,3],ymm11[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,1],ymm11[2,0],ymm12[5,5],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm8[2,0],ymm1[4,7],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # 
xmm8 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,1],ymm10[2,0],ymm11[5,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm0[1,0],ymm6[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm6[2,0],ymm0[4,7],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,1],ymm8[2,0],ymm10[5,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) @@ -6980,9 +7001,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rax) -; AVX1-ONLY-NEXT: addq $2536, %rsp # imm = 0x9E8 +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: addq $2584, %rsp # imm = 0xA18 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -6992,44 +7013,44 @@ define void 
@load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7037,7 +7058,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -7053,7 +7074,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1 @@ -7062,7 +7083,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -7078,7 +7099,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1 @@ -7087,7 +7108,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm1 @@ -7095,21 
+7116,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm1 @@ -7117,21 +7138,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 864(%rdi), %ymm1 @@ -7144,16 +7165,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -7161,73 +7182,73 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpermilps 
$237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm11, %ymm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm5, %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm11, %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -7279,8 +7300,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] 
+; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] @@ -7298,17 +7319,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] @@ -7370,49 +7391,49 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm12[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} ymm7 = ymm0[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # 
ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[3,3,3,3] @@ -7428,9 +7449,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7457,27 +7478,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] @@ -7485,20 +7506,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -7508,17 +7529,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -7544,9 +7565,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -7557,138 +7578,140 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] ; 
AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; 
AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 
32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1424(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm5, %ymm3 ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -7697,22 +7720,13 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; 
AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7720,8 +7734,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7729,175 +7743,184 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] 
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r8) -; AVX2-SLOW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, 
%ymm1, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r8) +; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm13, 160(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2472, %rsp # imm = 0x9A8 +; AVX2-FAST-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm7 +; 
AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <0,6,4,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7905,7 +7928,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, 
%ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -7921,7 +7944,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1 @@ -7930,7 +7953,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -7941,12 +7964,12 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1376(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1 @@ -7955,7 +7978,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 @@ -7963,21 +7986,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm1 @@ -7985,21 +8008,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 864(%rdi), %ymm1 @@ -8012,16 +8035,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -8029,205 +8052,206 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps 
%ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [2,0,6,4,2,0,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [2,0,6,4,2,0,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] 
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, 
%ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = 
ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2,3],ymm1[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1,2,3],ymm0[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1,2,3],ymm7[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm9 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: 
vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -8236,151 +8260,150 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm6[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm7[1,2,3],ymm2[4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, (%rsp), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] -; 
AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3],ymm7[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] +; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 
-; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -8392,8 +8415,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -8406,86 +8429,87 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] -; AVX2-FAST-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm2 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm0, 
%ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, 
%ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -8494,64 +8518,64 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm13[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte 
Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -8560,161 +8584,160 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-FAST-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, (%rsp), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = 
ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded 
Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rsi) 
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps 
%ymm1, 128(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) -; AVX2-FAST-NEXT: addq $2472, %rsp # imm = 0x9A8 +; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 160(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -8724,44 +8747,44 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8769,7 +8792,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -8785,7 +8808,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1 @@ -8794,7 +8817,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -8810,7 +8833,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1 @@ -8819,7 +8842,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm1 @@ -8827,21 +8850,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm1 @@ -8849,21 +8872,21 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 864(%rdi), %ymm1 @@ -8876,16 +8899,16 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -8893,73 +8916,73 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm11, 
%ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm11, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm5, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -9011,8 +9034,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] @@ -9030,17 +9053,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] @@ -9102,49 +9125,49 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = 
mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: 
vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,3,3,3] @@ -9160,9 +9183,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9189,27 +9212,27 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] @@ -9217,20 +9240,20 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9240,17 +9263,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9276,9 +9299,9 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -9289,138 +9312,140 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm0 +; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1424(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = 
ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -9429,22 +9454,22 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = 
[0,3,1,7,0,3,1,7] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -9452,8 +9477,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -9461,8 +9486,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -9471,117 +9496,117 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rsi) -; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) 
+; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: 
vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -9589,350 +9614,349 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i32_stride6_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm13, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512F-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 
-; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm6, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm31, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm9, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; 
AVX512F-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm18, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm31, %zmm27 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm1, %zmm20 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <0,6,12,18,24,30,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm10, %zmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <1,7,13,19,25,31,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm3, %zmm21 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> -; AVX512F-NEXT: 
vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <3,9,15,21,27,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <20,26,0,6,12,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm10, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 ; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm12, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 
-; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm25 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm30 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512F-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; 
AVX512F-NEXT: movw $31, %ax ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 
%zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 192(%r8) +; 
AVX512F-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -9940,350 +9964,349 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d 
%zmm3, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 
%zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm31, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm9, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm27 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,6,12,18,24,30,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm10, %zmm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,7,13,19,25,31,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm3, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512BW-NEXT: 
vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <3,9,15,21,27,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <20,26,0,6,12,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm22 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 +; 
AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm30 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512BW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: movw $31, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: 
kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 
%zmm4, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 97f499968e8e0..ab44439f98ff3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -281,56 +281,56 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm5 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm11[0],xmm7[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = 
xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[1,1,1,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm6[0],xmm2[1] -; SSE-NEXT: movapd %xmm7, (%rsi) -; SSE-NEXT: movapd %xmm5, (%rdx) -; SSE-NEXT: movapd %xmm9, (%rcx) +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm5, (%rsi) +; SSE-NEXT: movapd %xmm6, (%rdx) +; SSE-NEXT: movapd %xmm11, (%rcx) ; SSE-NEXT: movapd %xmm8, (%r8) ; SSE-NEXT: movapd %xmm4, (%r9) ; SSE-NEXT: movapd %xmm0, (%rdi) @@ -617,140 +617,146 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa 144(%rdi), %xmm11 -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa 192(%rdi), %xmm13 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movdqa 144(%rdi), 
%xmm9 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 192(%rdi), %xmm8 +; SSE-NEXT: movdqa 160(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm15 ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm2[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq 
{{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} 
xmm8 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: movdqa 208(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm12[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = 
mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movapd %xmm10, (%rdx) -; SSE-NEXT: movapd %xmm9, 16(%rdx) -; SSE-NEXT: movapd %xmm15, (%rcx) -; SSE-NEXT: movapd %xmm12, 16(%rcx) -; SSE-NEXT: movapd %xmm4, (%r8) -; SSE-NEXT: movapd %xmm6, 16(%r8) -; SSE-NEXT: movapd %xmm14, (%r9) -; SSE-NEXT: movapd %xmm11, 16(%r9) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movapd %xmm8, (%rcx) +; SSE-NEXT: movapd %xmm13, 16(%rcx) +; SSE-NEXT: movapd %xmm15, (%r8) +; SSE-NEXT: movapd %xmm7, 16(%r8) +; SSE-NEXT: movapd %xmm2, (%r9) +; SSE-NEXT: movapd %xmm9, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, (%rax) -; SSE-NEXT: movapd %xmm7, 16(%rax) +; SSE-NEXT: movapd %xmm14, (%rax) +; SSE-NEXT: movapd %xmm10, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm5, (%rax) -; SSE-NEXT: movapd %xmm3, 16(%rax) -; SSE-NEXT: popq %rax +; SSE-NEXT: movapd %xmm3, (%rax) +; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf8: @@ -1245,297 +1251,306 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 -; SSE-NEXT: movdqa 272(%rdi), %xmm2 -; SSE-NEXT: movdqa 224(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 +; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: movdqa 272(%rdi), %xmm5 +; SSE-NEXT: movdqa 224(%rdi), %xmm15 +; SSE-NEXT: movdqa 240(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa 160(%rdi), %xmm11 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm7 +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa 
%xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa 384(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa 416(%rdi), %xmm8 +; SSE-NEXT: movdqa 384(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa 144(%rdi), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm0 
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm14[2],xmm9[3],xmm14[3] 
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] ; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm15[0],xmm9[1] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm14[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm14[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; 
SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm14[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm15[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm15[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = 
xmm10[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm15[0],xmm14[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 16(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1552,24 +1567,24 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm7, 48(%r9) -; SSE-NEXT: movapd %xmm5, 32(%r9) -; SSE-NEXT: movapd %xmm9, (%r9) +; SSE-NEXT: movapd %xmm9, 48(%r9) +; SSE-NEXT: movapd %xmm14, 32(%r9) +; SSE-NEXT: movapd %xmm12, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm6, 48(%rax) -; SSE-NEXT: movapd %xmm1, 32(%rax) +; SSE-NEXT: movapd %xmm13, 48(%rax) +; SSE-NEXT: movapd %xmm15, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, 48(%rax) -; SSE-NEXT: movapd %xmm12, 32(%rax) -; SSE-NEXT: movapd %xmm11, (%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) -; SSE-NEXT: addq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movapd %xmm4, 48(%rax) +; SSE-NEXT: movapd %xmm6, 32(%rax) +; SSE-NEXT: movapd %xmm7, (%rax) +; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf16: @@ -1579,7 +1594,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 @@ -1590,8 +1605,8 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 @@ -1606,11 +1621,11 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 @@ -1618,173 +1633,173 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm5[2,2],ymm7[5,5],ymm5[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0],xmm15[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm13[1],xmm11[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm8[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm2[3,3],ymm1[4,4],ymm2[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm12[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm6[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1],ymm4[2,2],ymm13[5,5],ymm4[6,6] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[2,2],ymm15[5,5],ymm3[6,6] ; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0],xmm9[1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0],xmm10[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm3[0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm8[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm4[0,3],ymm12[7,5],ymm4[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,1],ymm12[2,0],ymm13[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = 
xmm12[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm7[0,0],ymm5[5,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm13[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm8[1,3],ymm7[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,2],ymm5[2,0],ymm1[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm13[0,0],ymm4[5,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,1],ymm2[0,2],ymm13[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,1],ymm6[1,3],ymm2[4,5],ymm6[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm7[2,0],ymm1[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm15[0,0],ymm3[5,4],ymm15[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,1],ymm3[0,2],ymm15[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm3[1,3],ymm5[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,2],ymm4[2,0],ymm0[4,6],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm2[0,0],ymm1[7,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,0],ymm8[2,0],ymm7[5,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1],ymm4[1,3],ymm5[4,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[0,2],ymm8[2,0],ymm0[4,6],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm6[2,0],ymm2[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm3[0,1,2],xmm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[0,0],ymm0[7,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,0],ymm0[6,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,1],ymm8[3,3],ymm7[6,5],ymm8[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,1],ymm6[3,3],ymm2[6,5],ymm6[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = 
ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[0,1],xmm2[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,1],ymm3[3,3],ymm5[6,5],ymm3[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[0,0],ymm13[1,0],ymm14[4,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1],xmm3[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,1],ymm4[3,3],ymm5[6,5],ymm4[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm3[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,0],ymm6[0,0],ymm7[7,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm1[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm12[1,0],ymm11[4,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm6[0,0],ymm2[7,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm1[1],xmm6[2,3] +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,0],ymm6[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm6[0,0],ymm5[7,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,0],ymm6[4,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,0],ymm12[2,0],ymm11[5,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm7[0,0],ymm5[7,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1802,11 +1817,11 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; 
AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -2042,37 +2057,35 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm5[1] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm12[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm0 @@ -2088,67 +2101,67 @@ define void 
@load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm7 = ymm9[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm15 ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm4[1],ymm13[2,3,4],ymm4[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] -; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm4 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm11[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] 
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -2161,105 +2174,106 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 
= xmm9[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm13[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm3 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) -; 
AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -2494,115 +2508,115 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm7, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm6, 
%zmm7, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm7, %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,8,15,22,29,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512F-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm14 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm7, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 ; AVX512F-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <18,25,0,7,14,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm6 {%k2} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm9, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm6 {%k1} +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <19,26,1,8,15,u,u,u> -; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512F-NEXT: # zmm10 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [5,12,19,26] -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm8 -; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm7, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm0 -; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512F-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 +; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%r10) ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2611,115 +2625,115 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,u,u,u> -; AVX512BW-NEXT: vpermi2d 
%zmm3, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm9 ; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm8, %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,8,15,22,29,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm14 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: 
# zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm8, %zmm15 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <18,25,0,7,14,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm6 {%k2} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm6 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <19,26,1,8,15,u,u,u> -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm10, %zmm15 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11,18,25] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [5,12,19,26] -; 
AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm8 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm0 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [5,12,19,26] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r10) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2744,503 +2758,491 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa 640(%rdi), %xmm2 -; SSE-NEXT: movdqa 608(%rdi), %xmm3 -; SSE-NEXT: movdqa 560(%rdi), %xmm8 +; SSE-NEXT: subq $1160, %rsp # imm = 0x488 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm3 +; SSE-NEXT: movdqa 608(%rdi), %xmm4 +; SSE-NEXT: movdqa 560(%rdi), %xmm10 +; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; 
SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 
528(%rdi), %xmm2 -; SSE-NEXT: movdqa 496(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa 528(%rdi), %xmm9 +; SSE-NEXT: movdqa 496(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa 384(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 384(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 784(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm2 -; SSE-NEXT: movdqa 832(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm3 +; SSE-NEXT: movdqa 832(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa 752(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 720(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa 592(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa 592(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa 480(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] ; SSE-NEXT: movdqa 704(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckldq 
{{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd 
{{.*#+}} xmm8 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm0 -; 
SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE-NEXT: movdqa 736(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = 
xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: 
punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 768(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} 
xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -3249,54 +3251,59 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm1 
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3316,9 +3323,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] @@ -3326,22 +3332,20 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; 
SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -3350,7 +3354,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -3413,7 +3418,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) @@ -3421,7 +3426,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%r9) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%r9) @@ -3436,9 +3441,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm12, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rax) +; SSE-NEXT: movaps %xmm1, 112(%rax) +; SSE-NEXT: movapd %xmm15, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3459,18 +3464,18 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm6, 48(%rax) ; SSE-NEXT: 
movapd %xmm7, 32(%rax) ; SSE-NEXT: movapd %xmm8, 16(%rax) -; SSE-NEXT: movapd %xmm13, (%rax) -; SSE-NEXT: addq $1176, %rsp # imm = 0x498 +; SSE-NEXT: movapd %xmm9, (%rax) +; SSE-NEXT: addq $1160, %rsp # imm = 0x488 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX1-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 @@ -3482,29 +3487,32 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 @@ -3513,8 +3521,9 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3524,27 +3533,24 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3554,14 +3560,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; 
AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3580,461 +3586,453 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm9[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,0],ymm1[3,3],ymm6[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0],xmm8[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm7 +; 
AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3],ymm11[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm2[3,3],ymm13[4,4],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm14[0,1] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0],ymm2[3,3],ymm12[4,4],ymm2[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1,2],mem[0] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm4[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm15[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,0],ymm5[3,3],ymm1[4,4],ymm5[7,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm0[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0],ymm5[3,3],ymm2[4,4],ymm5[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm11[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm6[2,2],ymm4[5,5],ymm6[6,6] -; AVX1-ONLY-NEXT: vmovups %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0],xmm10[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3],ymm3[0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm8[2,2],ymm3[5,5],ymm8[6,6] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm15[3,3],ymm5[4,4],ymm15[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0],xmm2[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm15[2,3],ymm5[0,1] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0],ymm13[3,3],ymm15[4,4],ymm13[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm13[1,2],xmm4[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[0,3],ymm13[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[2,1],ymm13[2,0],ymm11[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm10[0,3],ymm13[7,5],ymm10[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[2,1],ymm13[2,0],ymm7[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm8[0,3],ymm13[7,5],ymm8[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[2,1],ymm13[2,0],ymm9[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm13[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm15[1,2],xmm2[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,1],mem[0,3],ymm15[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,1],ymm15[2,0],ymm5[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = 
ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm9[0,3],ymm15[7,5],ymm9[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,1],ymm15[2,0],ymm8[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm6[0,3],ymm15[7,5],ymm6[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,1],ymm15[2,0],ymm4[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm3[0],ymm12[2],ymm3[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm3[0,3],ymm14[7,5],ymm3[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,1],ymm14[0,3],ymm8[7,5],ymm14[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm14 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1],ymm8[2,0],ymm4[6,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0],ymm1[0,0],ymm6[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1],ymm6[0,2],ymm1[7,5],ymm6[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,1],ymm14[1,3],ymm0[4,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm10[2,0],ymm5[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm8[0,0],ymm9[5,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm6[0,2],ymm8[7,5],ymm6[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[1,3],ymm1[4,5],ymm0[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,2],ymm10[2,0],ymm13[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[0,0],ymm3[5,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,1],ymm8[0,2],ymm7[7,5],ymm8[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] +; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm0 = ymm11[3,1],ymm0[0,2],ymm11[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm1[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1],ymm4[1,3],ymm2[4,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,2],ymm9[2,0],ymm11[4,6],ymm9[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[1,0],ymm4[0,0],ymm3[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1],ymm5[0,2],ymm4[7,5],ymm5[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1],ymm6[1,3],ymm8[4,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2],ymm5[2,0],ymm9[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,0],ymm4[0,0],ymm13[7,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = mem[0],xmm8[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm9[1,3],ymm1[4,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[0,2],ymm8[2,0],ymm3[4,6],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,0],ymm7[0,0],ymm10[5,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,1],ymm0[0,2],ymm7[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm11[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,0],ymm14[2,0],ymm8[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm8[1,3],ymm1[4,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,2],ymm10[2,0],ymm12[4,6],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm4[0,0],ymm14[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm2[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,1],ymm13[1,3],ymm7[4,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[0,2],ymm10[2,0],ymm6[4,6],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,1],ymm0[0,2],ymm2[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,1],ymm7[1,3],ymm5[4,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,2],ymm6[2,0],ymm15[4,6],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm0[0,0],ymm12[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm8[2,0],ymm1[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 
+; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,0],ymm9[2,0],ymm3[5,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm3[0,0],ymm11[7,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[1,0],ymm11[2,0],ymm14[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,0],ymm10[2,0],ymm2[6,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = 
xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm1[0,0],ymm9[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,0],ymm5[2,0],ymm15[5,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,0],ymm12[0,0],ymm15[7,4],ymm12[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm5[1,0],ymm7[2,0],ymm5[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,0],ymm12[2,0],ymm1[6,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,0],ymm2[2,0],ymm14[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,1],ymm9[3,3],ymm3[6,5],ymm9[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded 
Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm1[0],mem[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[1,0],ymm12[2,0],ymm15[5,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,0],mem[1,0],ymm1[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,1],ymm8[3,3],ymm3[6,5],ymm8[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0],xmm8[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0],xmm7[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,0],ymm15[1,0],ymm12[4,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm14[0,1],xmm9[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm12[1,0],ymm9[4,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, (%rsp), %xmm11, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[0,0],mem[1,0],ymm14[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[0,1],xmm13[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,1],ymm5[3,3],ymm6[6,5],ymm5[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[3,3],ymm0[6,5],ymm11[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0],xmm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,0],mem[1,0],ymm1[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,0],ymm0[2,0],ymm2[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm5[0,0],ymm6[1,0],ymm5[4,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[0,1],xmm1[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,0],ymm0[0,0],ymm4[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,0],ymm15[2,0],ymm12[5,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[0,0],ymm13[1,0],ymm5[4,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm12[2,0],ymm9[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm13[2,0],ymm5[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm1[0,0],ymm9[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm6[2,0],ymm5[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[0,0],ymm4[7,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: 
vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -4057,7 +4055,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] @@ -4071,12 +4069,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4093,7 +4091,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 @@ -4108,14 +4106,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 
672(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -4135,13 +4133,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4159,7 +4157,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] @@ -4172,20 +4170,21 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: 
vmovdqa 800(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4197,114 +4196,114 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] 
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] ; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,2],ymm13[1,3],ymm14[4,6],ymm13[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm4[0],ymm6[1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -4317,237 +4316,238 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm9[1,3],ymm11[4,6],ymm9[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm3 -; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastd 548(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 660(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm10 +; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 324(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastd 772(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 884(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 664(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; 
AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm4, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 440(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 888(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte 
Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermd 640(%rdi), %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 528(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpermd 864(%rdi), %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 752(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = 
xmm5[0],mem[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 80(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: 
vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 304(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 
(%r9) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-SLOW-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -4555,24 +4555,25 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-LABEL: load_i32_stride7_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1192, %rsp # imm = 0x4A8 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm9[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] @@ -4585,15 +4586,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm7[6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4605,13 +4604,11 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -4622,11 +4619,11 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4652,14 +4649,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4667,20 +4664,19 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm13[1],ymm8[2,3,4],ymm13[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4695,360 +4691,368 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = 
ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 
# 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm0[1],xmm11[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] +; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] +; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 
752(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm13[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm2[3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] -; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm3[1],xmm14[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm15[0],ymm2[2],ymm15[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] ; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = 
ymm7[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm14[0,2],ymm5[1,3],ymm14[4,6],ymm5[5,7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm10[0,2],ymm5[1,3],ymm10[4,6],ymm5[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm9 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm13[1,3],ymm12[4,6],ymm13[5,7] +; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm4[1],mem[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7] -; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm8[1,3],ymm3[4,6],ymm8[5,7] +; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm4[1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],mem[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm15[1,3],ymm2[4,6],ymm15[5,7] -; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm13[1,3],ymm14[4,6],ymm13[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm12[1,3],ymm10[4,6],ymm12[5,7] ; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd 548(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte 
Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd 772(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastd 772(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm14[1],ymm4[2,3,4],ymm14[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; 
AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = 
ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: 
vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm11 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%rsi) -; AVX2-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm11 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm10, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 64(%rax) ; AVX2-FAST-NEXT: addq $1192, %rsp # imm = 0x4A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -5072,7 +5076,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] @@ -5086,12 +5090,12 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -5108,7 +5112,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 @@ -5123,14 +5127,14 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -5150,13 +5154,13 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5174,7 +5178,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] @@ -5187,20 +5191,21 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5212,114 +5217,114 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 ; 
AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastd 652(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = 
ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,2],ymm13[1,3],ymm14[4,6],ymm13[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -5332,237 +5337,238 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm9[1,3],ymm11[4,6],ymm9[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; 
AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 548(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 660(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 324(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 436(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 772(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 884(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), 
%xmm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 664(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = 
mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 440(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 888(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd 640(%rdi), %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 528(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = 
xmm5[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd 864(%rdi), %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 752(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rsi) 
-; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = 
xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 304(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps 
%ymm6, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -5972,48 +5978,48 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2456, %rsp # imm = 0x998 -; SSE-NEXT: movdqa 1088(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm3 +; SSE-NEXT: movdqa 1088(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1056(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1008(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1024(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 608(%rdi), %xmm5 +; SSE-NEXT: movdqa 1024(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm13 +; SSE-NEXT: movdqa 608(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 560(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 576(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm15 ; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1456(%rdi), %xmm1 @@ -6030,635 +6036,632 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm14 -; SSE-NEXT: movdqa 48(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 496(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: 
punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 896(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 912(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 976(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 976(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 944(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1344(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1360(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 1424(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 1424(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1392(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm11 +; SSE-NEXT: movdqa 336(%rdi), %xmm12 ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm8 -; SSE-NEXT: movdqa 384(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm4 +; SSE-NEXT: movdqa 384(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 784(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm13 -; SSE-NEXT: movdqa 832(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1232(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 832(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1232(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1248(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 1312(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1280(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa 1312(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1680(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1280(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1680(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1696(%rdi), %xmm0 ; SSE-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: movdqa 1760(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1728(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm8 ; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm3 -; SSE-NEXT: movdqa 272(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm11 ; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa 752(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1120(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movdqa 752(%rdi), %xmm14 +; SSE-NEXT: movdqa 720(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1120(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1136(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa 1200(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1168(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1568(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: movdqa 1200(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1168(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1568(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1584(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa 1648(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: movdqa 1648(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1616(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1616(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm4[0],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: movdqa 592(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa 704(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1040(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 592(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 480(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa 704(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 928(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1264(%rdi), %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa 1152(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1040(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1488(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 928(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1376(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1712(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1264(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 1600(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 
= xmm1[1,1,1,1] +; SSE-NEXT: movdqa 1152(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1488(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1376(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1712(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 736(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1600(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 960(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1072(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 736(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1184(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1296(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 960(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd 
{{.*#+}} xmm9 = xmm5[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 1072(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1408(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 1184(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 1296(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movdqa 1408(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movdqa 1520(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movdqa 1632(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movdqa 1744(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 880(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 
# 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 992(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6667,46 +6670,42 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1104(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; 
SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1328(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1440(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6717,7 +6716,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 1552(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6732,140 +6732,159 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1776(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 
= xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[2,2,3,3] +; SSE-NEXT: 
punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6874,22 +6893,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -6898,8 +6906,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -6908,60 +6915,48 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: 
pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm11[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6970,8 +6965,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6980,25 +6975,26 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; 
SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7007,8 +7003,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7017,8 +7013,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7027,25 +7022,34 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7053,24 +7057,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] @@ -7080,7 +7087,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} 
xmm15 = xmm1[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] @@ -7088,22 +7096,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm14 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] @@ -7113,36 +7119,38 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = 
xmm1[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] @@ -7175,17 +7183,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] @@ -7201,12 +7210,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -7428,12 +7437,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX1-ONLY-NEXT: subq $3176, %rsp # imm = 0xC68 ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 @@ -7442,12 +7451,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm13 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 @@ -7463,21 +7471,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] @@ -7490,14 +7497,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7521,7 +7529,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -7537,21 +7545,21 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7574,47 +7582,47 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; 
AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[1] -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7628,75 +7636,76 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0],xmm12[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,0],ymm1[3,3],ymm8[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm0[2,2],ymm11[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0],ymm1[3,3],ymm3[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm11[2] +; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm10[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm9[1],xmm11[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 @@ -7713,13 +7722,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 @@ -7736,8 +7745,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm0[2,2],ymm15[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7759,12 +7767,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm0[2,2],ymm3[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm15[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 @@ -7774,406 +7783,405 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = zero,xmm3[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm9[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm0[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm4[3,3],ymm1[4,4],ymm4[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm10[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 
= ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm0[2,2],ymm8[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0],xmm3[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm3[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm14[3,3],ymm1[4,4],ymm14[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm15[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3],ymm0[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm10[3,3],ymm1[4,4],ymm10[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm10[1,2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm12[0,3],ymm14[7,5],ymm12[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm11[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm5[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, 
%ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[2,1],ymm12[2,0],ymm6[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,0],ymm14[0,0],ymm12[5,4],ymm14[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1],ymm7[0,2],ymm14[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm6[0,3],ymm12[7,5],ymm6[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm12[0,0],ymm10[5,4],ymm12[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm12[0,1],mem[1,3],ymm12[4,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,1],ymm8[1,3],ymm12[4,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,2],ymm15[2,0],ymm8[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm14[2,0],ymm12[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0],ymm14[0,0],ymm7[5,4],ymm14[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1],ymm7[0,2],ymm14[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = 
xmm8[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,1],ymm4[1,3],ymm8[4,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,2],ymm14[2,0],ymm0[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0],ymm4[0,0],ymm7[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1],ymm7[0,2],ymm4[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm1[1,3],ymm4[4,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,2],ymm14[2,0],ymm15[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,0],ymm12[0,0],ymm8[5,4],ymm12[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm5[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,1],ymm1[1,3],ymm5[4,5],ymm1[5,7] ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,0],ymm1[0,0],ymm4[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,1],ymm7[0,2],ymm1[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm13[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,2],ymm15[2,0],ymm1[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
ymm10 = ymm5[1,0],ymm1[0,0],ymm5[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,1],ymm10[0,2],ymm1[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,1],ymm10[1,3],ymm1[4,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[0,2],ymm14[2,0],ymm11[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,1],ymm0[1,3],ymm1[4,5],ymm0[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,2],ymm15[2,0],ymm13[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,1],ymm10[0,2],ymm0[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm9[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1],mem[1,3],ymm0[4,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,2],ymm15[2,0],ymm11[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm2[0,0],ymm5[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[0,0],ymm6[5,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[2,0],ymm6[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,2],ymm9[2,0],ymm7[4,6],ymm9[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm10[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm15[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,1],ymm8[1,3],ymm9[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm5[2,0],ymm15[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,1],ymm8[1,3],ymm12[4,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[0,2],ymm7[2,0],ymm11[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 +; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm10[1,3],ymm4[4,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm5[2,0],ymm14[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,2],ymm7[2,0],ymm3[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm5[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1],ymm1[1,3],ymm11[4,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,2],ymm6[2,0],ymm2[4,6],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,2],ymm9[2,0],ymm2[4,6],ymm9[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm5[0,0],ymm2[7,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[1,0],ymm1[2,0],ymm11[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,0],ymm0[6,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 ; 
AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm0[0,0],ymm12[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm3[2,0],ymm13[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm4[1,0],ymm10[2,0],ymm4[5,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,0],ymm0[0,0],ymm15[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm8[2,0],ymm9[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm0[0,0],ymm11[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm8[2,0],ymm12[5,4],ymm8[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8181,112 +8189,91 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm10 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm8[2,0],ymm2[5,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm13[2,1],mem[3,3],ymm13[6,5],mem[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm11 = xmm12[0],xmm10[1],xmm12[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[0,0],ymm15[1,0],ymm9[4,4],ymm15[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,1],xmm11[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0],xmm9[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm3[2,0],ymm10[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0],xmm9[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,0],ymm14[1,0],ymm13[4,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm6[2,1],mem[3,3],ymm6[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,0],mem[1,0],ymm10[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm10[2,1],mem[3,3],ymm10[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[0,1],xmm7[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm11[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm4[2,0],ymm7[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -8295,33 +8282,51 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm7[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0],ymm7[1,0],ymm6[4,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm3 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,1],xmm3[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] @@ -8333,12 +8338,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm5[0],mem[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -8351,10 +8356,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,1],ymm8[3,3],ymm2[6,5],ymm8[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -8367,9 +8373,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,1],xmm0[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm0[0,0],ymm11[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] @@ -8377,34 +8383,34 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm15[2,0],ymm2[5,4],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] ; 
AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,0],ymm1[0,0],ymm10[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 @@ -8413,6 +8419,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0],ymm1[0,0],ymm3[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] @@ -8433,91 +8458,70 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,0],ymm3[0,0],ymm6[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,0],ymm3[0,0],ymm7[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,0],ymm7[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm3[0,0],ymm5[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps 
$238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[3,0],ymm8[0,0],ymm9[7,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,0],ymm4[0,0],ymm7[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,0],ymm9[4,5],ymm4[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = 
ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm4[0,0],ymm8[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,0],ymm10[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,0],ymm10[0,0],ymm5[7,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -8526,14 +8530,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm11, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -8616,25 +8620,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) +; AVX1-ONLY-NEXT: addq $3176, %rsp # imm = 0xC68 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2664, %rsp # imm = 0xA68 +; AVX2-SLOW-NEXT: subq $2680, %rsp # imm = 0xA78 ; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm6 @@ -8681,12 +8685,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -8698,10 +8703,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8738,14 +8744,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -8758,19 +8764,17 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 992(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 976(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %xmm3 @@ -8787,10 +8791,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %xmm2 @@ -8832,10 +8836,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] @@ -8847,20 +8851,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] 
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8876,15 +8880,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8902,11 +8905,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8924,10 +8927,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8945,8 +8950,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8963,17 +8967,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8994,48 +8997,46 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: 
vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-SLOW-NEXT: vpbroadcastd 1324(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} 
ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -9044,32 +9045,31 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -9080,75 +9080,77 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 904(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX2-SLOW-NEXT: vpbroadcastd 1100(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-SLOW-NEXT: vpbroadcastd 1548(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm5[1,3],ymm9[4,6],ymm5[5,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm3[1,3],ymm6[4,6],ymm3[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = 
ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm8[1,3],ymm15[4,6],ymm8[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -9157,20 +9159,19 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm14[0,2],mem[1,3],ymm14[4,6],mem[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1776(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1776(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $253, (%rsp), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[1,3],ymm4[4,6],ymm1[5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -9182,13 +9183,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm10[1,3],ymm11[4,6],ymm10[5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] +; AVX2-SLOW-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm11 ; AVX2-SLOW-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -9197,8 +9199,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9212,502 +9214,498 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
xmm1 = xmm4[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm0 = mem[0,0] +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] +; AVX2-SLOW-NEXT: # xmm7 = mem[0,0] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm2 +; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm7, %ymm5 ; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm6 +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 
= ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = 
ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] +; 
AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd 
{{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte 
Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1256(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = 
xmm15[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) +; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm13[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = 
xmm7[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm0 -; 
AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 
= mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1256(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r8) -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm13, 224(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm14, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rax) -; AVX2-SLOW-NEXT: addq $2664, %rsp # imm = 0xA68 +; AVX2-SLOW-NEXT: vmovaps %ymm12, (%rax) +; AVX2-SLOW-NEXT: addq $2680, %rsp # imm = 0xA78 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2648, %rsp # imm = 0xA58 -; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm11 +; AVX2-FAST-NEXT: subq $2680, %rsp # imm = 0xA78 +; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), 
%ymm9 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -9718,12 +9716,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm15[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %xmm3 @@ -9734,13 +9733,15 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm11[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; 
AVX2-FAST-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -9751,14 +9752,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 1568(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 1568(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 1664(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 1648(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -9772,9 +9775,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 @@ -9791,11 +9794,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill @@ -9860,201 +9863,205 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1184(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 1184(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] 
+; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm15[2,3],ymm10[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 
32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 960(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] ; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %xmm2 @@ -10069,8 +10076,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -10079,34 +10085,34 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; 
AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm15 @@ -10115,79 +10121,79 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = 
ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm13 = ymm5[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0],ymm11[1],mem[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps 
{{.*#+}} xmm5 = xmm5[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2],ymm4[1,3],ymm12[4,6],ymm4[5,7] ; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,2],ymm14[1,3],ymm12[4,6],ymm14[5,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7] +; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm13 ; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} 
ymm5 = ymm11[0,2],ymm13[1,3],ymm11[4,6],ymm13[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm12[1,3],ymm14[4,6],ymm12[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] @@ -10197,8 +10203,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] @@ -10206,26 +10212,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[1,3],ymm4[4,6],ymm1[5,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] ; AVX2-FAST-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm11[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm10[1,3],ymm7[4,6],ymm10[5,7] -; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] +; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm10 ; AVX2-FAST-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] @@ -10234,7 +10239,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] ; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm2 @@ -10250,25 +10255,25 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] ; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3] -; AVX2-FAST-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm9 +; 
AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3] +; AVX2-FAST-NEXT: # xmm4 = mem[0,0] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vbroadcastss 212(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10276,341 +10281,342 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 324(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 436(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vbroadcastss 548(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcastss 436(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: 
vbroadcastss 548(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vbroadcastss 884(%rdi), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 996(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; 
AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vbroadcastss 1108(%rdi), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm8 ; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm12, %ymm15 +; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %xmm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; 
AVX2-FAST-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; 
AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $34, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = 
xmm5[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm14, %ymm12 +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 ; AVX2-FAST-NEXT: vbroadcastss 1336(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1,2],xmm12[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm11[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm13[1],mem[2,3,4],ymm13[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm14, %ymm11 -; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm10, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; 
AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm10, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = xmm14[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm10, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded 
Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 
32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3] -; 
AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1704(%rdi), %xmm5 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10687,42 +10693,43 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%r9) ; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm12, 224(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm8, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm10, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm14, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm13, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, (%rax) -; AVX2-FAST-NEXT: addq $2648, %rsp # imm = 0xA58 +; AVX2-FAST-NEXT: vmovaps %ymm10, (%rax) +; AVX2-FAST-NEXT: addq $2680, %rsp # imm = 0xA78 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2664, %rsp # imm = 0xA68 +; AVX2-FAST-PERLANE-NEXT: subq $2680, %rsp # imm = 0xA78 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm6 @@ -10769,12 +10776,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -10786,10 +10794,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10826,14 +10835,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] @@ -10846,19 +10855,17 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 992(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 976(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %xmm3 @@ -10875,10 +10882,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %xmm2 @@ -10920,10 +10927,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] @@ -10935,20 +10942,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10964,15 +10971,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10990,11 +10996,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -11012,10 +11018,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -11033,8 +11041,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11051,17 +11058,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11082,48 +11088,46 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1324(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -11132,32 +11136,31 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -11168,75 +11171,77 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: 
vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 904(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1100(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1352(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1548(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm5[1,3],ymm9[4,6],ymm5[5,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm3[1,3],ymm6[4,6],ymm3[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = 
ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm8[1,3],ymm15[4,6],ymm8[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -11245,20 +11250,19 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0,2],mem[1,3],ymm14[4,6],mem[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1776(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1776(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-FAST-PERLANE-NEXT: vpblendd $253, (%rsp), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[1,3],ymm4[4,6],ymm1[5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1552(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -11270,13 +11274,14 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm10[1,3],ymm11[4,6],ymm10[5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -11285,8 +11290,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -11300,212 +11305,224 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm0 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm7, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload 
-; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = 
xmm6[0,1,2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: 
vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload 
-; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] @@ -11516,590 +11533,571 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = 
xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded 
Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = 
ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps 
$238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 
32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: 
vmovaps %ymm9, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%r9) +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; 
AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 
96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2664, %rsp # imm = 0xA68 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $2680, %rsp # imm = 0xA78 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: load_i32_stride7_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, 
%zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: 
vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; 
AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm16 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm19 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm22, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm17, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm6, 
%zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm24, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 +; 
AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm27, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm19 +; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 ; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm27, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm9, %zmm21, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm27, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,8,15,22,29,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <18,25,0,7,14,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm5, %zmm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 
= <19,26,1,8,15,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm23 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm5, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, 
%zmm12 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm25 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm19, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-NEXT: movw $480, %ax # imm = 0x1E0 ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12109,60 +12107,60 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 +; AVX512F-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} ; AVX512F-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} @@ -12178,44 +12176,43 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm29, %zmm19 +; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm28, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm28, %zmm2 +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm7, %zmm28, %zmm7 +; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm20, 
64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rax) @@ -12223,10 +12220,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -12234,329 +12231,325 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i32_stride7_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; 
AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; 
AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 +; 
AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, 
%zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm21, 
%zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,8,15,22,29,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <18,25,0,7,14,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm5, %zmm22 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <19,26,1,8,15,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm23 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d 
%zmm0, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm5, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm12 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm19, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: 
kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-NEXT: movw $480, %ax # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12566,60 +12559,60 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} ; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} @@ -12635,44 +12628,43 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; 
AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm29, %zmm19 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm28, %zmm2 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm28, %zmm7 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rax) @@ -12680,10 +12672,10 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 7e7398050087c..25db5a66461e8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -487,100 +487,99 @@ define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i32_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 176(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm15 +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm10 +; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 64(%rdi), %xmm12 -; SSE-NEXT: movaps 160(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm5 -; SSE-NEXT: movaps 224(%rdi), %xmm10 -; SSE-NEXT: movaps 192(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movaps 64(%rdi), %xmm11 +; SSE-NEXT: movaps 160(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 224(%rdi), %xmm12 +; SSE-NEXT: movaps 192(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm14[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = 
xmm8[0],xmm14[0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps 240(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; SSE-NEXT: movaps 208(%rdi), %xmm10 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm15[1] -; SSE-NEXT: movaps 80(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movaps %xmm10, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movaps 16(%rdi), %xmm0 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 
16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm6, 16(%rdx) -; SSE-NEXT: movaps %xmm14, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm11, (%r8) -; SSE-NEXT: movaps %xmm5, 16(%r8) +; SSE-NEXT: movaps %xmm5, 16(%rdx) +; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm10, (%r8) +; SSE-NEXT: movaps %xmm6, 16(%r8) ; SSE-NEXT: movaps %xmm4, (%r9) ; SSE-NEXT: movaps %xmm13, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, (%rax) -; SSE-NEXT: movaps %xmm12, 16(%rax) +; SSE-NEXT: movaps %xmm14, (%rax) +; SSE-NEXT: movaps %xmm11, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, (%rax) -; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm9, 16(%rax) -; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf8: @@ -590,87 +589,87 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm8[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} 
xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,0],ymm12[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,0],ymm7[1,0],ymm6[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm14[2,0],ymm8[2,3],ymm14[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[1,0],ymm7[1,0],ymm6[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[2,0],ymm8[2,3],ymm13[6,4],ymm8[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,0],ymm15[4,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = 
ymm15[0,1],ymm13[2,0],ymm15[4,5],ymm13[6,4] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[3,0],ymm6[7,4],ymm7[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm10[2,3],ymm6[6,4],ymm10[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] @@ -683,7 +682,7 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm12, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -698,7 +697,6 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i32_stride8_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 @@ -708,13 +706,13 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm5 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vbroadcastss %xmm9, %xmm6 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm6 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm5 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vbroadcastss %xmm9, %xmm5 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 @@ -723,11 +721,11 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] @@ -738,71 +736,70 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1,2],xmm11[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm15[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm9[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = 
ymm9[0],ymm13[0],ymm9[2],ymm13[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, (%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm9, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX2-ONLY-NEXT: vzeroupper @@ -899,96 +896,94 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $296, %rsp # imm = 0x128 -; SSE-NEXT: movaps 288(%rdi), %xmm1 -; SSE-NEXT: movaps 352(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm7 -; SSE-NEXT: movaps 416(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm11 -; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps 128(%rdi), %xmm15 -; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 192(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movaps %xmm15, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm6[0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm5 +; SSE-NEXT: movaps 416(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 480(%rdi), %xmm13 +; SSE-NEXT: movaps 448(%rdi), %xmm4 +; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps 128(%rdi), %xmm10 +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 192(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: movaps %xmm10, 
%xmm11 +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; SSE-NEXT: movaps %xmm11, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movaps 256(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 64(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movaps 96(%rdi), %xmm10 +; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm14 +; SSE-NEXT: movaps 32(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; 
SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1002,70 +997,70 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; 
SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 400(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps 368(%rdi), %xmm12 +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 464(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 432(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 400(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 368(%rdi), %xmm14 ; SSE-NEXT: movaps 336(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: movaps 304(%rdi), %xmm11 -; SSE-NEXT: movaps 272(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 112(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movaps 304(%rdi), %xmm12 +; SSE-NEXT: movaps 272(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdi), %xmm13 ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = 
xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: unpckhps (%rsp), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm15[1] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1102,271 +1097,273 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r9) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm8, 48(%rax) -; SSE-NEXT: movaps %xmm5, (%rax) +; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm10, 48(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm9, 32(%rax) +; SSE-NEXT: movaps %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: movaps %xmm13, 32(%rax) -; SSE-NEXT: movaps %xmm9, 16(%rax) -; SSE-NEXT: movaps %xmm6, (%rax) +; SSE-NEXT: movaps %xmm6, 48(%rax) +; SSE-NEXT: movaps %xmm7, 32(%rax) +; SSE-NEXT: movaps %xmm12, 16(%rax) +; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: addq $296, %rsp # imm = 0x128 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $584, %rsp # imm = 0x248 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; 
AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[4],ymm6[4],ymm8[5],ymm6[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 
32(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[4],ymm13[4],ymm14[5],ymm13[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,0],ymm6[1,0],ymm9[5,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm2[1,0],ymm0[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[4],ymm11[4],ymm15[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm8[1,0],ymm7[5,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[2,0],ymm6[2,3],ymm1[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,0],ymm1[1,0],ymm0[5,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,0],ymm14[1,0],ymm13[5,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,0],ymm5[1,0],ymm7[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,0],ymm15[2,3],ymm2[6,4],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm14[0],ymm5[1],ymm14[1],ymm5[4],ymm14[4],ymm5[5],ymm14[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,0],ymm13[1,0],ymm12[5,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,1],ymm7[2,0],ymm1[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1],ymm7[2,0],ymm2[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[6],ymm11[6],ymm15[7],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[3,0],ymm3[3,0],ymm2[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[2,3],ymm3[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0],ymm14[3,0],ymm13[7,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm4[3,0],ymm2[7,4],ymm4[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm3[3,0],ymm1[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm2[2,3],ymm6[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm14[2],ymm5[3],ymm14[3],ymm5[6],ymm14[6],ymm5[7],ymm14[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0],ymm13[3,0],ymm12[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1393,254 +1390,255 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: addq $584, %rsp # imm = 0x248 +; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride8_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm15 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm8 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm9 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm10 +; 
AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm0, %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps %xmm15, %xmm10 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps %xmm8, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; 
AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps %xmm8, %xmm7 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), 
%ymm6 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm14 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm15[0],ymm6[0],ymm15[1],ymm6[1],ymm15[4],ymm6[4],ymm15[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[4],ymm4[4],ymm12[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm13[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm14[0],ymm7[0],ymm14[1],ymm7[1],ymm14[4],ymm7[4],ymm14[5],ymm7[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[4],ymm0[4],ymm8[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = 
ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm2 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm6[2],ymm15[3],ymm6[3],ymm15[6],ymm6[6],ymm15[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm14[2],ymm7[2],ymm14[3],ymm7[3],ymm14[6],ymm7[6],ymm14[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = 
ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 @@ -1662,12 +1660,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1959,49 
+1957,50 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride8_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $952, %rsp # imm = 0x3B8 -; SSE-NEXT: movaps 544(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm5 +; SSE-NEXT: movaps 544(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 672(%rdi), %xmm6 +; SSE-NEXT: movaps 608(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm7 +; SSE-NEXT: movaps 576(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm10 +; SSE-NEXT: movaps 672(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 640(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 736(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 704(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; 
SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movaps 512(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2012,11 +2011,12 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 448(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 416(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 416(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2024,179 +2024,177 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 992(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps 960(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 928(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movaps 288(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 288(%rdi), %xmm3 +; SSE-NEXT: movaps 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 864(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 832(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 800(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 768(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm15 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 32(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte 
Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps (%rsp), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: unpckhps (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = 
xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 176(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm13 ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 304(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps 368(%rdi), %xmm4 +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps 304(%rdi), %xmm5 +; SSE-NEXT: movaps 272(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm7 +; SSE-NEXT: movaps 464(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movaps 432(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 400(%rdi), %xmm1 @@ -2207,16 +2205,16 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movaps 592(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 560(%rdi), %xmm2 +; SSE-NEXT: movaps 624(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 528(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 592(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 560(%rdi), %xmm6 +; SSE-NEXT: movaps 528(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2224,13 +2222,13 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 752(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 720(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 720(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 688(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 656(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps 656(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] @@ -2239,116 +2237,118 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 848(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 848(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 816(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 784(%rdi), %xmm1 +; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 784(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 976(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps 976(%rdi), %xmm11 +; SSE-NEXT: movaps 
%xmm11, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 944(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 912(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 912(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 16(%rdi), %xmm12 -; SSE-NEXT: movaps 48(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 16(%rdi), %xmm15 +; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; SSE-NEXT: 
unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm14[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] ; 
SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm8[1] -; SSE-NEXT: movaps %xmm13, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: movaps %xmm5, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -2381,7 +2381,7 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rcx) @@ -2430,7 +2430,8 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -2439,50 +2440,55 @@ define void @load_i32_stride8_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm5, 112(%rax) -; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm7, 80(%rax) -; SSE-NEXT: movaps %xmm8, 64(%rax) -; SSE-NEXT: movaps %xmm9, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm3, 112(%rax) +; SSE-NEXT: movaps %xmm8, 96(%rax) +; SSE-NEXT: movaps %xmm4, 80(%rax) +; SSE-NEXT: movaps %xmm13, 64(%rax) +; SSE-NEXT: movaps %xmm12, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, 112(%rax) +; SSE-NEXT: movaps %xmm2, 112(%rax) ; SSE-NEXT: movaps %xmm6, 96(%rax) -; SSE-NEXT: movaps %xmm11, 80(%rax) -; SSE-NEXT: movaps %xmm13, 64(%rax) -; SSE-NEXT: movaps %xmm14, 48(%rax) -; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps %xmm5, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps %xmm7, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm12, (%rax) +; SSE-NEXT: movaps %xmm15, (%rax) ; SSE-NEXT: addq $952, %rsp # imm = 0x3B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1736, %rsp # imm = 0x6C8 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; AVX1-ONLY-NEXT: subq $1800, %rsp # imm = 0x708 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 
416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -2491,39 +2497,37 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 @@ -2541,242 +2545,241 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 
= ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0],xmm11[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; 
AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,2,2,2] 
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = 
xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm8[0],ymm3[1],ymm8[1],ymm3[4],ymm8[4],ymm3[5],ymm8[5] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm14 +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -2787,55 +2790,56 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm12 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2845,153 +2849,160 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,0],ymm13[4,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[4],ymm15[4],ymm2[5],ymm15[5] -; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[4],ymm3[4],ymm6[5],ymm3[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm12[1,0],ymm8[5,4],ymm12[5,4] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[4],ymm12[4],ymm9[5],ymm12[5] +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm5[1,0],ymm13[5,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[4],ymm14[4],ymm1[5],ymm14[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,0],ymm5[1,0],ymm3[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm8[1,0],ymm14[5,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte 
Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[4],ymm11[4],ymm7[5],ymm11[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm9[1,0],ymm7[5,4],ymm9[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm9[1,0],ymm1[5,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm4[1,0],ymm6[5,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,0],ymm2[1,0],ymm5[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 
32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[4],mem[4],ymm14[5],mem[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,0],ymm2[1,0],ymm14[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = 
ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[6],ymm14[6],ymm8[7],ymm14[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhps 
{{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[6],ymm10[6],ymm4[7],ymm10[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[6],ymm15[6],ymm9[7],ymm15[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -2999,31 +3010,30 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[6],ymm6[6],ymm8[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,0],ymm3[3,0],ymm7[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm6[3,0],ymm8[7,4],ymm6[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -3033,24 +3043,25 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,0],ymm4[3,0],ymm10[7,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[3,0],ymm13[3,0],ymm10[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,0],ymm9[3,0],ymm15[7,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) @@ -3104,7 +3115,8 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3112,40 +3124,39 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX1-ONLY-NEXT: addq $1800, %rsp # imm = 0x708 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride8_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 +; AVX2-ONLY-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm11 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm10 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm13 +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm14 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm10 +; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm15 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm15 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 @@ -3162,17 +3173,17 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3192,7 +3203,7 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -3217,46 +3228,46 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vmovaps 
%xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps %xmm10, %xmm3 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3265,29 +3276,30 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; 
AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm8[0],mem[0],xmm8[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -3295,15 +3307,15 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] @@ -3313,147 +3325,147 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1,2],xmm8[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: 
vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: 
vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm12[2],mem[2],xmm12[3],mem[3] +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: 
vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 @@ -3466,21 +3478,22 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3492,95 +3505,88 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups 
%ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm10 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm5 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm4 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovaps %ymm13, %ymm15 -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3589,2453 +3595,2410 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte 
Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[6],ymm3[6],ymm11[7],ymm3[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; 
AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm15 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm5 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm15 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm4[1],ymm10[3],ymm4[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, 
%ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r9) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 +; AVX2-ONLY-NEXT: vpermilps $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm3 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX2-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
192(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm1, 
%zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 -; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: 
vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf32: ; AVX512F-ONLY-FAST: # %bb.0: 
-; AVX512F-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d 
%zmm17, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = 
[7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 
+; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%zmm30, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d 
%zmm31, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm11, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; 
AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512F-ONLY-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: movb $-64, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[1,9,17,25,1,9,17,25] ; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; 
AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm29, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; 
AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} 
ymm9 = [2,10,18,26,2,10,18,26] ; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: 
vmovaps %zmm7, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd $15, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQ-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i32_stride8_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: movb $-64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 
%zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; 
AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, 
%zmm17, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] 
; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm7, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm7, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQ-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
192(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; 
AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d 
%zmm22, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 
{{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; 
AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; 
AVX512BW-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-FAST-NEXT: # 
ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] ; 
AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
-; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5
-; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
-; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26]
; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11
-; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
-; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
-; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
-; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
-; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
-; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
-; AVX512BW-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11
-; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
-; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
-; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
-; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
-; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
-; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
-; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi)
-; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx)
-; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27]
+; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28]
+; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29]
+; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30]
+; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31]
+; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rdx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rcx)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9)
; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax)
; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax)
; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax)
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax)
-; AVX512BW-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512BW-ONLY-FAST-NEXT: addq $1096, %rsp # imm = 0x448
; AVX512BW-ONLY-FAST-NEXT: vzeroupper
; AVX512BW-ONLY-FAST-NEXT: retq
;
; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf32:
; AVX512DQBW-SLOW: # %bb.0:
-; AVX512DQBW-SLOW-NEXT: subq $1032, %rsp # imm = 0x408
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT: subq $1096, %rsp # imm = 0x448
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm6
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm28
; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2
; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
; AVX512DQBW-SLOW-NEXT: movb $-64, %al
; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm6, %zmm8, %zmm1
; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm4, %zmm1
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm4, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm15, %zmm1
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm13, %zmm1
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm31
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm10, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm15, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm13, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm10, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm15, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm13, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm4, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm3, %zmm23
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm10, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm15, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm13, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm21
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm2
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm15, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm1, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm2
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm0
; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
-; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18
; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
-; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
-; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26]
; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
-; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
-; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
-; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
-; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
-; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
-; AVX512DQBW-SLOW-NEXT: # ymm18 = mem[0,1,0,1]
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
-; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
-; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
-; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
-; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
-; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
-; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
-; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 64(%rsi)
-; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rsi)
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
-; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rdx)
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx)
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8)
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%r8)
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm6, %zmm8
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29]
+; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30]
+; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31]
+; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm19, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm28, %zmm2, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm19, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1}
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1}
+; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rdx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%r8)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%r9)
; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rax)
; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax)
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax)
; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax)
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax)
-; AVX512DQBW-SLOW-NEXT: addq $1032, %rsp # imm = 0x408
+; AVX512DQBW-SLOW-NEXT: addq $1096, %rsp # imm = 0x448
; AVX512DQBW-SLOW-NEXT: vzeroupper
; AVX512DQBW-SLOW-NEXT: retq
;
; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf32:
; AVX512DQBW-FAST: # %bb.0:
-; AVX512DQBW-FAST-NEXT: subq $1032, %rsp # imm = 0x408
-; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT: subq $1096, %rsp # imm = 0x448
+; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm6
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm28
; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11
-; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22
-; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
-; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29
-; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16
-; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31
-; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7
-; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15
-; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28
-; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19
-; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5
+; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm17
+; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm22
+; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2
; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
; AVX512DQBW-FAST-NEXT: movb $-64, %al
; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24]
; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm3
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm4
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1}
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm1
; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
+; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm4, %zmm1
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25]
; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm4, %zmm2
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
+; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm15, %zmm1
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
+; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm13, %zmm1
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2
+; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
+; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm27
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm27
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm31
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm10, %zmm31
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm15, %zmm30
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm22
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm26
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm12
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm10, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm15, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm13, %zmm6
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm3, %zmm23
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm10, %zmm16
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm20
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm15, %zmm20
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm17
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm13, %zmm17
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm25
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm9
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm11
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm11
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm14
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm14
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm21
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm10, %zmm21
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm15, %zmm7
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm29
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm4
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm6
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm2
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm7
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm27
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm6
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm25
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm0, %zmm13
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm0
; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
-; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k1}
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25]
; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm18
; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26]
-; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1}
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16
-; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
-; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27]
-; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm5
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1}
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22
-; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,10,18,26,2,10,18,26]
; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10
-; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29]
-; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3
-; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30]
-; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1
-; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31]
-; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1]
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13
-; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5
-; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1}
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9
-; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1}
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3
; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1}
-; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1}
-; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1}
-; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5
-; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1}
-; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6
-; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 64(%rsi)
-; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rsi)
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx)
-; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rdx)
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx)
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8)
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%r8)
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm24
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1}
+; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [3,11,19,27,3,11,19,27]
+; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm9
+; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4
+; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm14 {%k1}
+;
AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm28, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm30, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm22, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm26, %zmm5 +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-FAST-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 @@ -6062,16 +6025,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride8_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2232, %rsp # imm = 0x8B8 -; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 288(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm7 +; SSE-NEXT: movaps 416(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm8 +; SSE-NEXT: movaps 384(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 480(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm3 @@ -6092,8 +6055,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = 
xmm3[0],xmm7[0],xmm3[1],xmm7[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 @@ -6101,21 +6064,21 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movaps 256(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps 256(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 736(%rdi), %xmm9 ; SSE-NEXT: movaps 704(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movaps 672(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 640(%rdi), %xmm1 @@ -6126,16 +6089,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 544(%rdi), %xmm2 +; SSE-NEXT: movaps 608(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 512(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 576(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 544(%rdi), %xmm15 +; SSE-NEXT: movaps 512(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6158,14 +6122,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 864(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 
832(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 832(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 800(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 800(%rdi), %xmm14 +; SSE-NEXT: movaps 768(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6173,8 +6137,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1248(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1216(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 1216(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1184(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6186,11 +6150,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1120(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1088(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1120(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1088(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 1056(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1024(%rdi), %xmm1 @@ -6203,8 +6168,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1504(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1472(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 1472(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1440(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6218,8 +6183,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1376(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1344(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 1344(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1312(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6233,8 +6198,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1760(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1728(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 1728(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1696(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6248,14 +6213,15 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1632(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1600(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps 1600(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1568(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1536(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 1568(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1536(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6263,8 +6229,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2016(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1984(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps 1984(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1952(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6278,236 +6244,239 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1888(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1856(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps 1856(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1824(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1792(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; SSE-NEXT: movaps 1824(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1792(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 32(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps (%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps (%rsp), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] +; SSE-NEXT: 
movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm14[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; 
SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm14 -; SSE-NEXT: movaps 208(%rdi), %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps 176(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdi), %xmm1 @@ -6519,70 +6488,64 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm3 ; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 304(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 304(%rdi), %xmm4 ; SSE-NEXT: movaps 272(%rdi), %xmm1 ; 
SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm5 ; SSE-NEXT: movaps 464(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 432(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps 432(%rdi), %xmm6 ; SSE-NEXT: movaps 400(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 624(%rdi), %xmm9 ; SSE-NEXT: movaps 592(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 560(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movaps 560(%rdi), %xmm10 ; SSE-NEXT: movaps 528(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 752(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 752(%rdi), %xmm12 ; SSE-NEXT: movaps 720(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 688(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 656(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movaps 688(%rdi), %xmm13 +; SSE-NEXT: movaps 656(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 880(%rdi), %xmm12 +; SSE-NEXT: movaps 880(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 848(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 816(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 784(%rdi), %xmm1 @@ -6593,11 +6556,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1008(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 976(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1008(%rdi), %xmm14 +; SSE-NEXT: movaps 976(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movaps 944(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 912(%rdi), %xmm1 @@ -6610,8 +6572,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1136(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1104(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps 1104(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1072(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6623,11 +6585,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1264(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1232(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1264(%rdi), %xmm11 +; SSE-NEXT: movaps 1232(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movaps 1200(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1168(%rdi), %xmm1 @@ -6640,8 +6601,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1392(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1360(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps 1360(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1328(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6655,8 +6616,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1520(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1488(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 1488(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1456(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6670,24 +6631,23 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1648(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1616(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 1616(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1584(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1552(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1776(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1744(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1776(%rdi), %xmm8 +; SSE-NEXT: movaps 1744(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movaps 1712(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1680(%rdi), %xmm1 @@ -6700,8 +6660,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1904(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1872(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps 1872(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1840(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6715,225 +6675,234 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2032(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 2000(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps 2000(%rdi), %xmm0 +; SSE-NEXT: movaps 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1968(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1968(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1936(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movaps 16(%rdi), %xmm13 -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = 
xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps (%rsp), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd 
{{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm11[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm15[1] -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm9[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload -; 
SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: movaps %xmm13, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm13[0] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 
160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 240(%rsi) @@ -7124,44 +7093,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, 240(%rax) ; SSE-NEXT: movaps %xmm4, 224(%rax) ; SSE-NEXT: movaps %xmm5, 208(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) -; SSE-NEXT: movaps %xmm12, 176(%rax) -; SSE-NEXT: movaps %xmm8, 160(%rax) -; SSE-NEXT: movaps %xmm9, 144(%rax) -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps %xmm11, 112(%rax) -; SSE-NEXT: movaps %xmm14, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm6, (%rax) +; SSE-NEXT: movaps %xmm2, 192(%rax) +; SSE-NEXT: movaps %xmm6, 176(%rax) +; SSE-NEXT: movaps %xmm7, 160(%rax) +; SSE-NEXT: movaps %xmm13, 144(%rax) +; SSE-NEXT: movaps %xmm14, 128(%rax) +; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, 240(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rax) +; SSE-NEXT: movaps %xmm9, 240(%rax) +; SSE-NEXT: movaps %xmm8, 224(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps %xmm10, 192(%rax) +; SSE-NEXT: movaps %xmm12, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rax) +; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 
80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -7171,18 +7139,18 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movaps %xmm11, (%rax) ; SSE-NEXT: addq $2232, %rsp # imm = 0x8B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3688, %rsp # imm = 0xE68 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $3720, %rsp # imm = 0xE88 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 @@ -7190,18 +7158,19 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -7223,9 +7192,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklps 
{{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 @@ -7235,11 +7204,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7250,24 +7219,25 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), 
%xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7280,9 +7250,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm3 @@ -7291,11 +7261,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7306,18 +7276,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 
672(%rdi), %xmm1 @@ -7332,92 +7301,105 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm6[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] @@ -7425,99 +7407,85 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = 
mem[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpermilps $85, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # 
xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, (%rsp), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7538,8 +7506,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7548,10 +7515,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7559,8 +7526,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7577,11 +7545,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7590,19 +7557,19 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] @@ -7613,146 +7580,147 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm7[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = 
xmm13[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm2[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm9[1] +; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7765,11 +7733,12 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] @@ -7777,10 +7746,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm13[2],mem[2],xmm13[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] @@ -7791,50 +7761,50 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 
%ymm1, %ymm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[4],ymm1[4],ymm11[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[4],ymm12[4],ymm8[5],ymm12[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] @@ -7851,11 +7821,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7869,14 +7839,14 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7905,664 +7875,668 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: 
vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[4],ymm2[4],ymm13[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), 
%ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm1[2,0],ymm8[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[4],mem[4],ymm12[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm11[1,0],ymm1[5,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm5[1,0],ymm1[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm15[1,0],ymm1[5,4],ymm15[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm15 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[4],ymm15[4],ymm9[5],ymm15[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,0],ymm2[1,0],ymm6[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[1],ymm6[1],ymm13[4],ymm6[4],ymm13[5],ymm6[5] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,0],ymm8[1,0],ymm12[5,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[1,0],ymm11[1,0],ymm7[5,4],ymm11[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm5[1,0],mem[1,0],ymm5[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = 
ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[4],ymm5[4],ymm12[5],ymm5[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm7[1,0],ymm10[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = 
ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm13[1,0],ymm1[5,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[4],ymm13[4],ymm15[5],ymm13[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 
32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[1,0],ymm8[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0],mem[0],ymm8[1],mem[1],ymm8[4],mem[4],ymm8[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[1,0],ymm8[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0],mem[0],ymm8[1],mem[1],ymm8[4],mem[4],ymm8[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = 
ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = 
ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm9[1],ymm15[3],ymm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte 
Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 
32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[6],ymm13[6],ymm11[7],ymm13[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte 
Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
xmm5 = xmm8[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[3,0],ymm13[3,0],ymm11[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[3,0],ymm9[3,0],ymm7[7,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 
$1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,0],ymm15[3,0],ymm1[7,4],ymm15[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = 
ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[3,0],ymm9[3,0],ymm7[7,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm0[2,3],ymm8[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[3,0],ymm11[3,0],ymm13[7,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm15[3,0],ymm9[7,4],ymm15[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 
32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rax) -; AVX1-ONLY-NEXT: addq $3688, %rsp # imm = 0xE68 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $3720, %rsp # imm = 0xE88 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride8_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3464, %rsp # imm = 0xD88 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: subq $3528, %rsp # imm = 0xDC8 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -8572,9 +8546,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm13 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 @@ -8588,19 +8562,18 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vmovaps 
960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 @@ -8622,9 +8595,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8652,9 +8625,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8693,11 +8666,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -8710,11 +8683,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), 
%xmm2 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8740,11 +8713,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8753,56 +8726,57 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: 
vmovaps 160(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps %xmm13, %xmm9 +; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8816,12 +8790,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8831,16 +8806,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # 
xmm2 = mem[1,1,1,1] @@ -8850,16 +8826,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8869,29 +8846,32 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0],xmm14[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8905,13 +8885,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm10[0],mem[0],xmm10[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8925,13 +8905,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm9[0],mem[0],xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = 
xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8942,145 +8922,145 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} 
xmm0 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = 
xmm10[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm9[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -9104,7 +9084,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -9120,10 +9100,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 @@ -9133,11 +9113,9 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm5[1],xmm11[1] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded 
Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] @@ -9151,8 +9129,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm5[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] @@ -9163,12 +9140,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = mem[2,3,2,3] @@ -9176,10 +9154,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: 
vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -9189,8 +9167,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -9216,12 +9193,11 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm13 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9233,16 +9209,16 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm10 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] +; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 @@ -9261,22 +9237,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9288,22 +9264,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9315,22 +9291,22 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9340,199 +9316,185 @@ define void 
@load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[4],ymm1[4],ymm11[5],ymm1[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[4],ymm1[4],ymm6[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm9 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} 
ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm11 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovaps %ymm12, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm14 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm13 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, 
%ymm12, %xmm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1172(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1428(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX2-ONLY-NEXT: vpermilps $85, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; 
AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1684(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 1172(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 1428(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1940(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastss 1684(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9545,15 +9507,27 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 1940(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload @@ -9562,113 +9536,114 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte 
Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[2,2,2,2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1272(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1528(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1784(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, 
%ymm8, %xmm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 2040(%rdi), %ymm0 @@ -9678,17 +9653,17 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -9732,5000 +9707,1402 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm1 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 1244(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 1500(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 1756(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 2012(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) 
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax)
-; AVX2-ONLY-NEXT: addq $3464, %rsp # imm = 0xD88
+; AVX2-ONLY-NEXT: addq $3528, %rsp # imm = 0xDC8
 ; AVX2-ONLY-NEXT: vzeroupper
 ; AVX2-ONLY-NEXT: retq
 ;
-; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf64:
-; AVX512F-ONLY-SLOW: # %bb.0:
-; AVX512F-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al
-; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
-; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
-; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
-; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
-; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
-; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
-; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
-; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
-; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
-; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
-; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
-; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
-; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9)
-; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512F-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8
-; AVX512F-ONLY-SLOW-NEXT: vzeroupper
-; AVX512F-ONLY-SLOW-NEXT: retq
-;
-; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf64:
-; AVX512F-ONLY-FAST: # %bb.0:
-; AVX512F-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: movb $-64, %al
-; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
-; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
-; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
-; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
-; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
-; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
-; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
-; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
-; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
-; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
-; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13
{%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: 
vmovaps %zmm5, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf64: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: movb $-64, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: 
vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 
64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; 
AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; 
AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQ-SLOW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: load_i32_stride8_vf64: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 
192(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: movb $-64, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, 
%zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
-; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm23
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm19
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm31
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
-; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm24
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm11
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm28
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm27
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm29
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm21
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
-; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm9
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm23
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
-; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm16
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm17
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm28
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm11
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
-; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm22
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm29
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm27
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm20
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm12
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
-; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
-; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
-; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
-; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
-; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
-; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
-; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
-; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
-; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rsi)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rsi)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rsi)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rsi)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rdx)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rdx)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rdx)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rdx)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rcx)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rcx)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rcx)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rcx)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r8)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r8)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r8)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r8)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r9)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r9)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r9)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r9)
-; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, (%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax)
-; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rax)
-; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax)
-; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512DQ-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8
-; AVX512DQ-FAST-NEXT: vzeroupper
-; AVX512DQ-FAST-NEXT: retq
-;
-; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf64:
-; AVX512BW-ONLY-SLOW: # %bb.0:
-; AVX512BW-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al
-; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
-; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
-; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25]
-; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
-; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26]
-; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
-; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27]
-; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
-; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28]
-; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
-; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
-; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
-; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
-; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29]
-; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30]
-; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15
-; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31]
-; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
-; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9)
-; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8
-; AVX512BW-ONLY-SLOW-NEXT: vzeroupper
-; AVX512BW-ONLY-SLOW-NEXT: retq
-;
-; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf64:
-; AVX512BW-ONLY-FAST: # %bb.0:
-; AVX512BW-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
-; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1
-; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2
-; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al
-; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24]
-; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
-; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3,
%zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 
= ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, 
%zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 
%zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 
64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; 
AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512BW-ONLY-FAST-NEXT: vzeroupper -; AVX512BW-ONLY-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf64: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: movb $-64, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, 
%zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, 
%ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq +; AVX512F-LABEL: load_i32_stride8_vf64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512F-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: movb $-64, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm19, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm9 +; AVX512F-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-NEXT: 
vpermt2d %zmm7, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 
%zmm1, %zmm18 {%k1} +; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = 
[5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] +; 
AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rdx) +; 
AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, (%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm2, 128(%rax) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf64: -; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 
64(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: movb $-64, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, 
%zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d 
%zmm24, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQBW-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-FAST-NEXT: vzeroupper -; AVX512DQBW-FAST-NEXT: retq +; AVX512BW-LABEL: load_i32_stride8_vf64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm7 +; AVX512BW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; 
AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm19, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, 
%zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 +; AVX512BW-NEXT: 
vmovdqa64 %zmm10, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm9 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; 
AVX512BW-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 +; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = 
mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %wide.vec = load <512 x i32>, ptr 
%in.vec, align 64 %strided.vec0 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> %strided.vec1 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll index fb01a0ad31557..8fa1dd6a30b52 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -246,35 +246,35 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm8 ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm9 +; SSE-NEXT: movaps 208(%rdi), %xmm11 ; SSE-NEXT: movaps 192(%rdi), %xmm2 ; SSE-NEXT: movaps 240(%rdi), %xmm10 ; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm14 ; SSE-NEXT: movaps 128(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm13 +; SSE-NEXT: movaps 176(%rdi), %xmm12 ; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps 80(%rdi), %xmm14 +; SSE-NEXT: movaps 80(%rdi), %xmm13 ; SSE-NEXT: movaps 64(%rdi), %xmm5 ; SSE-NEXT: movaps 112(%rdi), %xmm15 ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] ; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm10[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; SSE-NEXT: movaps %xmm1, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] @@ -284,12 +284,12 @@ define void @load_i64_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movaps %xmm13, 96(%rsi) -; SSE-NEXT: movaps %xmm9, 112(%rsi) +; SSE-NEXT: movaps %xmm12, 96(%rsi) +; SSE-NEXT: movaps %xmm11, 112(%rsi) ; SSE-NEXT: movaps %xmm15, 64(%rsi) -; SSE-NEXT: movaps %xmm12, 80(%rsi) -; SSE-NEXT: movaps %xmm11, 32(%rsi) -; SSE-NEXT: movaps %xmm14, 48(%rsi) +; SSE-NEXT: movaps %xmm14, 80(%rsi) +; 
SSE-NEXT: movaps %xmm9, 32(%rsi) +; SSE-NEXT: movaps %xmm13, 48(%rsi) ; SSE-NEXT: movaps %xmm10, (%rsi) ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps %xmm4, 112(%rdx) @@ -404,61 +404,61 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i64_stride2_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rdi), %xmm1 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 272(%rdi), %xmm9 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 304(%rdi), %xmm12 -; SSE-NEXT: movaps 288(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rdi), %xmm14 +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 272(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm9 +; SSE-NEXT: movaps 304(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm13 +; SSE-NEXT: movaps 176(%rdi), %xmm4 +; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm14[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: movaps %xmm10, 
%xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm14[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps 352(%rdi), %xmm15 @@ -468,46 +468,46 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 336(%rdi), %xmm0 ; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movaps 416(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: movaps 384(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps 480(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdi), %xmm0 +; SSE-NEXT: movaps 480(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 
464(%rdi), %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm3 -; SSE-NEXT: movaps 448(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps %xmm12, 160(%rsi) +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, 224(%rsi) +; SSE-NEXT: movaps %xmm11, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm5, 240(%rsi) +; SSE-NEXT: movaps %xmm6, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -519,17 +519,17 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 128(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps %xmm14, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm1, 224(%rdx) -; SSE-NEXT: movaps %xmm4, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm9, 208(%rdx) +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm3, 224(%rdx) +; SSE-NEXT: movaps %xmm7, 240(%rdx) +; SSE-NEXT: movaps %xmm9, 192(%rdx) +; SSE-NEXT: movaps %xmm12, 208(%rdx) ; SSE-NEXT: movaps %xmm13, 160(%rdx) ; SSE-NEXT: movaps %xmm15, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -548,141 +548,141 @@ define void @load_i64_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; 
AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm12[0],ymm7[2],ymm12[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm14[1],ymm5[3],ymm14[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = 
ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm13[1],ymm6[3],ymm13[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride2_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm5[0],ymm8[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm5[1],ymm15[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm14[1],ymm6[1],ymm14[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 
64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -732,62 +732,62 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-LABEL: load_i64_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps 304(%rdi), %xmm10 -; SSE-NEXT: movaps 272(%rdi), %xmm8 -; SSE-NEXT: movaps 256(%rdi), %xmm6 -; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 224(%rdi), %xmm7 -; SSE-NEXT: movaps 208(%rdi), %xmm1 -; SSE-NEXT: movaps 192(%rdi), %xmm9 -; SSE-NEXT: movaps 176(%rdi), %xmm2 +; SSE-NEXT: movaps 304(%rdi), %xmm0 +; SSE-NEXT: movaps 272(%rdi), %xmm1 +; SSE-NEXT: movaps 256(%rdi), %xmm8 +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps 224(%rdi), %xmm9 +; SSE-NEXT: movaps 208(%rdi), %xmm3 +; SSE-NEXT: movaps 192(%rdi), %xmm10 +; SSE-NEXT: movaps 176(%rdi), %xmm4 ; SSE-NEXT: movaps 160(%rdi), %xmm11 -; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps 144(%rdi), %xmm5 ; SSE-NEXT: movaps 128(%rdi), %xmm12 -; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 80(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm7 ; SSE-NEXT: movaps 64(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 +; 
SSE-NEXT: movaps %xmm13, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 336(%rdi), %xmm0 ; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -901,52 +901,52 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1) no ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 848(%rdi), %xmm0 -; SSE-NEXT: movaps 832(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 832(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm0 -; SSE-NEXT: movaps 864(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 912(%rdi), %xmm4 -; SSE-NEXT: movaps 896(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] -; SSE-NEXT: movaps 944(%rdi), %xmm4 +; SSE-NEXT: movaps 864(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 912(%rdi), %xmm0 +; SSE-NEXT: movaps 896(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 944(%rdi), %xmm0 ; SSE-NEXT: movaps 928(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] -; SSE-NEXT: movaps 976(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 976(%rdi), %xmm0 ; SSE-NEXT: movaps 960(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movaps 1008(%rdi), %xmm4 -; SSE-NEXT: movaps 992(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 1008(%rdi), %xmm3 +; SSE-NEXT: movaps 992(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: movaps (%rdi), %xmm14 +; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm14, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] +; SSE-NEXT: movaps 32(%rdi), %xmm11 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, 496(%rsi) -; SSE-NEXT: movaps %xmm3, 480(%rsi) -; SSE-NEXT: movaps %xmm5, 464(%rsi) -; SSE-NEXT: movaps %xmm7, 
448(%rsi) -; SSE-NEXT: movaps %xmm11, 432(%rsi) -; SSE-NEXT: movaps %xmm10, 416(%rsi) +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps %xmm4, 496(%rsi) +; SSE-NEXT: movaps %xmm1, 480(%rsi) +; SSE-NEXT: movaps %xmm2, 464(%rsi) +; SSE-NEXT: movaps %xmm9, 448(%rsi) +; SSE-NEXT: movaps %xmm7, 432(%rsi) +; SSE-NEXT: movaps %xmm12, 416(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -995,14 +995,15 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps %xmm2, 496(%rdx) +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm5, 496(%rdx) ; SSE-NEXT: movaps %xmm6, 480(%rdx) ; SSE-NEXT: movaps %xmm8, 464(%rdx) -; SSE-NEXT: movaps %xmm9, 448(%rdx) -; SSE-NEXT: movaps %xmm12, 432(%rdx) -; SSE-NEXT: movaps %xmm15, 416(%rdx) +; SSE-NEXT: movaps %xmm13, 448(%rdx) +; SSE-NEXT: movaps %xmm15, 432(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rdx) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -1051,126 +1052,125 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps %xmm13, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm11, 16(%rdx) +; SSE-NEXT: movaps %xmm14, (%rdx) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vperm2f128 
{{.*#+}} ymm6 = ymm9[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm9, %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm4, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd 
{{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm9[0],ymm15[2],ymm9[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm9[1],ymm15[3],ymm9[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = 
ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 
{{.*#+}} ymm0 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) @@ -1181,14 +1181,14 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 320(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm14, 352(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) @@ -1213,167 +1213,167 @@ define void @load_i64_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-ONLY-LABEL: load_i64_stride2_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 
352(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovups %ymm12, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), 
%ymm1 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,2,1,3] -; AVX2-ONLY-NEXT: 
vmovaps %ymm1, 384(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, (%rsp), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, (%rsp), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; 
AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX2-ONLY-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll index 88e8d2cff874c..ac1be65682a7b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -364,11 +364,11 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 272(%rdi), %xmm3 ; SSE-NEXT: movapd 80(%rdi), %xmm2 ; SSE-NEXT: movapd 96(%rdi), %xmm5 -; SSE-NEXT: movapd 112(%rdi), %xmm11 +; SSE-NEXT: movapd 112(%rdi), %xmm12 ; SSE-NEXT: movapd 144(%rdi), %xmm6 ; SSE-NEXT: movapd 160(%rdi), %xmm14 ; SSE-NEXT: movapd 192(%rdi), %xmm7 -; SSE-NEXT: movapd 208(%rdi), %xmm12 +; SSE-NEXT: movapd 208(%rdi), %xmm11 ; SSE-NEXT: movapd 240(%rdi), %xmm10 ; SSE-NEXT: movapd 256(%rdi), %xmm13 ; SSE-NEXT: movapd 48(%rdi), %xmm9 @@ -385,66 +385,66 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm11, %xmm14 +; SSE-NEXT: movapd %xmm12, %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1] +; SSE-NEXT: movapd %xmm13, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm3[0] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm7[0],xmm13[1] +; SSE-NEXT: movapd %xmm11, %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm11[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 336(%rdi), %xmm12 -; SSE-NEXT: movapd 352(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1] -; SSE-NEXT: movapd 368(%rdi), %xmm4 -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: movapd 288(%rdi), %xmm2 -; SSE-NEXT: movapd 304(%rdi), %xmm5 +; SSE-NEXT: movapd 336(%rdi), %xmm13 +; SSE-NEXT: movapd 352(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm13[0],xmm7[1] +; SSE-NEXT: movapd 368(%rdi), %xmm11 +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd 288(%rdi), %xmm0 +; SSE-NEXT: movapd 304(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd 320(%rdi), %xmm6 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] +; SSE-NEXT: movapd (%rdi), %xmm2 +; SSE-NEXT: movapd 16(%rdi), %xmm5 ; SSE-NEXT: movapd %xmm5, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd 320(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd (%rdi), %xmm5 -; SSE-NEXT: movapd 16(%rdi), %xmm8 -; SSE-NEXT: movapd %xmm8, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] -; SSE-NEXT: movapd 32(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] -; SSE-NEXT: movsd 
{{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm3, 96(%rsi) +; SSE-NEXT: movapd 32(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd %xmm1, 96(%rsi) ; SSE-NEXT: movapd %xmm14, 32(%rsi) ; SSE-NEXT: movapd %xmm7, 112(%rsi) ; SSE-NEXT: movapd %xmm15, 48(%rsi) -; SSE-NEXT: movapd %xmm13, 64(%rsi) -; SSE-NEXT: movapd %xmm6, (%rsi) -; SSE-NEXT: movapd %xmm11, 80(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movapd %xmm2, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movapd %xmm12, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movapd %xmm5, (%rdx) +; SSE-NEXT: movapd %xmm8, 64(%rsi) +; SSE-NEXT: movapd %xmm3, (%rsi) +; SSE-NEXT: movapd %xmm12, 80(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movapd %xmm0, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movapd %xmm13, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movapd %xmm2, (%rdx) ; SSE-NEXT: movapd %xmm10, 80(%rdx) ; SSE-NEXT: movapd %xmm9, 16(%rdx) -; SSE-NEXT: movapd %xmm0, 96(%rcx) -; SSE-NEXT: movapd %xmm4, 112(%rcx) +; SSE-NEXT: movapd %xmm6, 96(%rcx) +; SSE-NEXT: movapd %xmm11, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -453,7 +453,7 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movapd %xmm1, (%rcx) +; SSE-NEXT: movapd %xmm4, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: addq $24, %rsp @@ -464,55 +464,55 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm4[1],ymm3[0],ymm4[3],ymm3[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm5[1],ymm2[0],ymm5[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = 
ymm7[1],ymm5[0],ymm7[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[1],ymm2[0],ymm8[3],ymm2[2] ; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm7[1],ymm1[0],ymm7[3],ymm1[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm9[1],ymm1[0],ymm9[3],ymm1[2] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm6[1],ymm0[0],ymm6[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] ; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 96(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm11, 64(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -639,56 +639,56 @@ 
define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movapd 224(%rdi), %xmm0 -; SSE-NEXT: movapd 272(%rdi), %xmm3 -; SSE-NEXT: movapd 128(%rdi), %xmm1 -; SSE-NEXT: movapd 176(%rdi), %xmm5 -; SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 96(%rdi), %xmm6 -; SSE-NEXT: movapd 112(%rdi), %xmm11 -; SSE-NEXT: movapd 144(%rdi), %xmm7 -; SSE-NEXT: movapd 160(%rdi), %xmm12 -; SSE-NEXT: movapd 192(%rdi), %xmm8 -; SSE-NEXT: movapd 208(%rdi), %xmm13 -; SSE-NEXT: movapd 240(%rdi), %xmm9 -; SSE-NEXT: movapd 256(%rdi), %xmm2 -; SSE-NEXT: movapd 48(%rdi), %xmm10 -; SSE-NEXT: movapd 64(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] +; SSE-NEXT: movapd 224(%rdi), %xmm6 +; SSE-NEXT: movapd 272(%rdi), %xmm9 +; SSE-NEXT: movapd 128(%rdi), %xmm5 +; SSE-NEXT: movapd 176(%rdi), %xmm8 +; SSE-NEXT: movapd 80(%rdi), %xmm7 +; SSE-NEXT: movapd 96(%rdi), %xmm10 +; SSE-NEXT: movapd 112(%rdi), %xmm0 +; SSE-NEXT: movapd 144(%rdi), %xmm11 +; SSE-NEXT: movapd 160(%rdi), %xmm1 +; SSE-NEXT: movapd 192(%rdi), %xmm12 +; SSE-NEXT: movapd 208(%rdi), %xmm2 +; SSE-NEXT: movapd 240(%rdi), %xmm13 +; SSE-NEXT: movapd 256(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm14 +; SSE-NEXT: movapd 64(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm7[0] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm4[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm11, %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm12[0],xmm5[1] +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm10[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm5[0] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm0[0] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
shufpd {{.*#+}} xmm11 = xmm11[1],xmm8[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm3[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm6[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm2 ; SSE-NEXT: movapd 304(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -740,80 +740,80 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 528(%rdi), %xmm15 -; SSE-NEXT: movapd 544(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 560(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 576(%rdi), %xmm11 +; SSE-NEXT: movapd 544(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 560(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 576(%rdi), %xmm12 ; SSE-NEXT: movapd 592(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm12[0],xmm14[1] ; SSE-NEXT: movapd 608(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 624(%rdi), %xmm8 ; SSE-NEXT: movapd 640(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] -; SSE-NEXT: movapd 656(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm12[0] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: movapd 672(%rdi), %xmm0 -; SSE-NEXT: movapd 688(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm0, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd 656(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm13[0] +; 
SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd 672(%rdi), %xmm6 +; SSE-NEXT: movapd 688(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] ; SSE-NEXT: movapd 704(%rdi), %xmm10 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] -; SSE-NEXT: movapd 720(%rdi), %xmm2 -; SSE-NEXT: movapd 736(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm10[0] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd 720(%rdi), %xmm4 +; SSE-NEXT: movapd 736(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] ; SSE-NEXT: movapd 752(%rdi), %xmm7 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm7[0] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; SSE-NEXT: movapd (%rdi), %xmm4 -; SSE-NEXT: movapd 16(%rdi), %xmm6 -; SSE-NEXT: movapd %xmm6, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] +; SSE-NEXT: movapd (%rdi), %xmm2 +; SSE-NEXT: movapd 16(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd 32(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] -; SSE-NEXT: movapd %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm9[0] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm5, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movapd %xmm3, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movapd %xmm14, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movapd %xmm5, (%rsi) -; SSE-NEXT: movapd %xmm13, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
movaps %xmm1, 16(%rsi) -; SSE-NEXT: movapd %xmm0, 224(%rdx) -; SSE-NEXT: movapd %xmm2, 240(%rdx) -; SSE-NEXT: movapd %xmm11, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movapd %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm11, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movapd %xmm6, 224(%rdx) +; SSE-NEXT: movapd %xmm4, 240(%rdx) +; SSE-NEXT: movapd %xmm12, 192(%rdx) ; SSE-NEXT: movapd %xmm8, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rdx) @@ -834,12 +834,12 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movapd %xmm4, (%rdx) +; SSE-NEXT: movapd %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movapd %xmm7, 240(%rcx) ; SSE-NEXT: movapd %xmm10, 224(%rcx) -; SSE-NEXT: movapd %xmm12, 208(%rcx) +; SSE-NEXT: movapd %xmm13, 208(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -870,241 +870,233 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm1 +; AVX1-ONLY-NEXT: subq $232, %rsp +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm10[0],ymm12[3],ymm10[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; 
AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm8[0],ymm11[3],ymm8[2] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[3],ymm9[2] +; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm13[0],ymm2[3],ymm13[2] +; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm3[0],ymm4[3],ymm3[2] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1],ymm1[0],ymm6[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm7[1],ymm6[0],ymm7[3],ymm6[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm3[1],ymm8[0],ymm3[3],ymm8[2] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm4[1],ymm5[0],ymm4[3],ymm5[2] -; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm1[1],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm11[1],ymm2[0],ymm11[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 
+; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm0[0],ymm10[1],ymm0[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm9[0],ymm4[3],ymm9[2] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[1],ymm1[0],ymm5[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm2[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2],ymm2[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm6[1],ymm11[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm13[1],ymm6[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm13[1],mem[2],ymm13[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0],ymm8[1],ymm13[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2],ymm13[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2],ymm14[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm3[1],mem[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm8[1],mem[2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, (%rsi) -; 
AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 192(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 224(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 128(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 128(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm15, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 224(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 192(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 224(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 160(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vmovapd %ymm14, 32(%rcx) +; AVX1-ONLY-NEXT: addq $232, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride3_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $232, %rsp -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: 
vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm5[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 +; 
AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] @@ -1112,63 +1104,63 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 32(%rcx) ; AVX2-ONLY-NEXT: addq $232, %rsp ; AVX2-ONLY-NEXT: vzeroupper @@ -1252,56 +1244,56 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movapd 272(%rdi), %xmm1 -; SSE-NEXT: movapd 224(%rdi), %xmm2 -; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 128(%rdi), %xmm4 +; SSE-NEXT: movapd 272(%rdi), %xmm9 +; SSE-NEXT: movapd 224(%rdi), %xmm8 +; SSE-NEXT: movapd 176(%rdi), %xmm7 +; SSE-NEXT: movapd 128(%rdi), %xmm6 ; SSE-NEXT: movapd 80(%rdi), %xmm5 -; SSE-NEXT: movapd 240(%rdi), %xmm6 -; SSE-NEXT: movapd 256(%rdi), %xmm11 -; SSE-NEXT: movapd 192(%rdi), %xmm7 -; SSE-NEXT: movapd 208(%rdi), %xmm12 -; SSE-NEXT: movapd 144(%rdi), %xmm8 -; SSE-NEXT: movapd 160(%rdi), %xmm13 -; SSE-NEXT: movapd 96(%rdi), %xmm9 -; 
SSE-NEXT: movapd 112(%rdi), %xmm14 -; SSE-NEXT: movapd 48(%rdi), %xmm10 -; SSE-NEXT: movapd 64(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] +; SSE-NEXT: movapd 240(%rdi), %xmm10 +; SSE-NEXT: movapd 256(%rdi), %xmm0 +; SSE-NEXT: movapd 192(%rdi), %xmm11 +; SSE-NEXT: movapd 208(%rdi), %xmm1 +; SSE-NEXT: movapd 144(%rdi), %xmm12 +; SSE-NEXT: movapd 160(%rdi), %xmm2 +; SSE-NEXT: movapd 96(%rdi), %xmm13 +; SSE-NEXT: movapd 112(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm14 +; SSE-NEXT: movapd 64(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm5[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm5[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm4[0] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm14[0],xmm4[1] +; SSE-NEXT: movapd %xmm3, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm13[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm3[0] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm6[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm7[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm12[0],xmm2[1] +; SSE-NEXT: movapd %xmm1, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm11, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm8[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm2 ; SSE-NEXT: movapd 304(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -1504,122 +1496,121 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1248(%rdi), %xmm2 ; SSE-NEXT: movapd 1264(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] +; SSE-NEXT: movapd %xmm0, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] ; SSE-NEXT: movapd 1280(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1296(%rdi), %xmm14 +; SSE-NEXT: movapd 1296(%rdi), %xmm15 ; SSE-NEXT: movapd 1312(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm15[0],xmm11[1] ; SSE-NEXT: movapd 1328(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1344(%rdi), %xmm12 ; SSE-NEXT: movapd 1360(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm12[0],xmm8[1] +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1] ; SSE-NEXT: movapd 1376(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd 1392(%rdi), %xmm9 -; SSE-NEXT: movapd 1408(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd 1424(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1440(%rdi), %xmm4 -; SSE-NEXT: movapd 1456(%rdi), %xmm7 -; SSE-NEXT: movapd %xmm7, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd 1472(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1488(%rdi), %xmm1 -; SSE-NEXT: movapd 1504(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: movapd 1392(%rdi), %xmm10 +; SSE-NEXT: movapd 1408(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] +; SSE-NEXT: movapd 1424(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1440(%rdi), %xmm9 +; SSE-NEXT: movapd 1456(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 
= xmm9[0],xmm1[1] +; SSE-NEXT: movapd 1472(%rdi), %xmm3 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1488(%rdi), %xmm0 +; SSE-NEXT: movapd 1504(%rdi), %xmm8 +; SSE-NEXT: movapd %xmm8, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd 1520(%rdi), %xmm13 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm13[0] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd (%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm13[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] +; SSE-NEXT: movapd (%rdi), %xmm8 ; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm10[0],xmm7[1] -; SSE-NEXT: movapd 32(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, %xmm5 -; SSE-NEXT: movapd %xmm6, 496(%rsi) -; SSE-NEXT: movapd %xmm2, 480(%rsi) -; SSE-NEXT: movapd %xmm3, 464(%rsi) -; SSE-NEXT: movapd %xmm8, 448(%rsi) +; SSE-NEXT: movapd %xmm5, %xmm6 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] +; SSE-NEXT: movapd 32(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd %xmm3, 496(%rsi) +; SSE-NEXT: movapd %xmm1, 480(%rsi) +; SSE-NEXT: movapd %xmm2, 464(%rsi) +; SSE-NEXT: movapd %xmm7, 448(%rsi) ; SSE-NEXT: movapd %xmm11, 432(%rsi) -; SSE-NEXT: movapd %xmm15, 416(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 384(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 368(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 320(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 304(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 288(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 272(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 256(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 
# 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movapd %xmm7, (%rsi) -; SSE-NEXT: movapd %xmm1, 496(%rdx) -; SSE-NEXT: movapd %xmm4, 480(%rdx) -; SSE-NEXT: movapd %xmm9, 464(%rdx) +; SSE-NEXT: movapd %xmm14, 416(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 400(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 384(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 368(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 352(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 336(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 320(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 304(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 288(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 272(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 256(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movapd %xmm6, (%rsi) +; 
SSE-NEXT: movapd %xmm0, 496(%rdx) +; SSE-NEXT: movapd %xmm9, 480(%rdx) +; SSE-NEXT: movapd %xmm10, 464(%rdx) ; SSE-NEXT: movapd %xmm12, 448(%rdx) -; SSE-NEXT: movapd %xmm14, 432(%rdx) +; SSE-NEXT: movapd %xmm15, 432(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1672,7 +1663,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm10, (%rdx) +; SSE-NEXT: movapd %xmm8, (%rdx) ; SSE-NEXT: movapd %xmm13, 496(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%rcx) @@ -1734,59 +1725,62 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm5, (%rcx) +; SSE-NEXT: movapd %xmm4, (%rcx) ; SSE-NEXT: addq $1176, %rsp # imm = 0x498 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1128, %rsp # imm = 0x468 +; AVX1-ONLY-NEXT: subq $1096, %rsp # imm = 0x448 ; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm4[0],ymm5[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 
= mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm6[0],ymm7[3],ymm6[2] ; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm3[0],ymm6[3],ymm3[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm5[0],ymm8[3],ymm5[2] ; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm2[0],ymm7[3],ymm2[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[2] ; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm9[0],ymm8[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm3[0],ymm10[3],ymm3[2] ; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm11[0],ymm10[3],ymm11[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm2[0],ymm11[3],ymm2[2] ; AVX1-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm11[0],ymm0[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[2] ; AVX1-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1797,10 +1791,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm5[0],ymm0[3],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[2] ; AVX1-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1812,177 +1806,179 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: 
vbroadcastsd 80(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm13[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[3],ymm13[2] ; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2] ; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm4[0],ymm12[3],ymm4[2] -; AVX1-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm3[0],ymm6[3],ymm3[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm4[0],ymm11[3],ymm4[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = 
ymm15[1],ymm2[0],ymm15[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm2[0],ymm6[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm14[1],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 
928(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1264(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm1[0],ymm3[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $5, (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2],ymm9[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 1264(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm0[1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0],ymm7[1],ymm1[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0],ymm7[1],mem[2],ymm7[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0],ymm9[1],mem[2],ymm9[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm10[2,3],mem[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovapd %ymm5, 448(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 384(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 320(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 256(%rsi) -; 
AVX1-ONLY-NEXT: vmovapd %ymm4, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm1[1],ymm6[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0],ymm1[1],ymm11[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm4[1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0],ymm9[1],ymm4[2],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm4[1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0],ymm11[1],mem[2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm11[0],ymm13[1],ymm11[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovapd %ymm7, 448(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 384(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 320(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 256(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 128(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 480(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 416(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 352(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 416(%rsi) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2025,21 +2021,19 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 192(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 320(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) @@ -2053,7 +2047,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1128, %rsp # imm = 0x468 +; AVX1-ONLY-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2107,182 +2101,182 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,3,2,3] +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,3,2,3] +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,3,2,3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = 
ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm9[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = 
ymm3[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] @@ -2440,138 +2434,124 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i64_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $200, %rsp -; AVX512-NEXT: vmovaps 1472(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm16 -; AVX512-NEXT: vmovaps 1280(%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm15 -; AVX512-NEXT: vmovaps 1088(%rdi), %zmm1 -; AVX512-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm22 -; AVX512-NEXT: vmovaps 896(%rdi), %zmm1 -; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm29 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 
256(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,3,6,9,12,15,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,3,6,9,12,15,u,u> ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512-NEXT: vpermt2q %zmm20, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512-NEXT: vpermt2q %zmm23, %zmm13, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-NEXT: vpermt2q %zmm12, %zmm13, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm9 -; AVX512-NEXT: vpermt2q %zmm17, %zmm13, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512-NEXT: vpermt2q %zmm14, %zmm13, %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = <1,4,7,10,13,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512-NEXT: vpermt2q %zmm20, %zmm26, %zmm29 +; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm18 +; AVX512-NEXT: vpermt2q %zmm12, %zmm11, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = <1,4,7,10,13,u,u,u> +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = <10,13,0,3,6,u,u,u> -; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512-NEXT: vpermt2q %zmm12, %zmm26, %zmm30 -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm30 +; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 +; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm29 +; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm8 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512-NEXT: vpermt2q %zmm23, %zmm26, %zmm28 -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512-NEXT: vpermt2q %zmm18, %zmm26, %zmm27 -; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512-NEXT: vpermt2q %zmm17, %zmm26, %zmm22 -; AVX512-NEXT: vpermt2q %zmm21, %zmm31, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512-NEXT: vpermt2q %zmm14, %zmm26, %zmm21 -; AVX512-NEXT: vpermt2q %zmm16, %zmm31, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm16 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512-NEXT: vpermi2q %zmm15, %zmm19, %zmm13 -; AVX512-NEXT: vpermi2q %zmm15, %zmm19, %zmm26 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm15 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm28 +; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm27 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm10 +; 
AVX512-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm26 +; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm22 +; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm11 +; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm9 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm24 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm25 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm16 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm17 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm18 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm20 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm16 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm29 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm28 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm27 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm22 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm21 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm26 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm29 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm28 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm20 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm19 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm23 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm18 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm17 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm12 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm10 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm12 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm24, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm10, 
192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm21, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm27, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm28, 128(%rdx) +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm13 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm20, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, 256(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm26, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm27, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm28, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm29, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm30, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm29, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm12, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm13, 384(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm17, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm23, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512-NEXT: addq $200, %rsp +; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm12, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <192 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll index 5ab0db77a9a06..1b508c764cc8c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -256,56 +256,56 @@ define void @load_i64_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movaps 240(%rdi), %xmm5 ; SSE-NEXT: movaps 208(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm6 +; SSE-NEXT: movaps 176(%rdi), %xmm9 ; SSE-NEXT: movaps 144(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm10 +; SSE-NEXT: movaps 112(%rdi), %xmm11 ; SSE-NEXT: movaps 80(%rdi), %xmm2 ; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: movaps 224(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps 160(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rdi), %xmm15 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] +; SSE-NEXT: movaps 32(%rdi), %xmm13 +; SSE-NEXT: movaps 224(%rdi), %xmm14 +; SSE-NEXT: movaps 192(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm12 +; SSE-NEXT: movaps 64(%rdi), %xmm8 +; SSE-NEXT: 
movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] +; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] ; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movaps %xmm4, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps %xmm13, (%rsi) -; SSE-NEXT: movaps %xmm15, 32(%rsi) -; SSE-NEXT: movaps %xmm11, 16(%rsi) -; SSE-NEXT: movaps %xmm8, 48(%rdx) +; SSE-NEXT: movaps %xmm15, 48(%rsi) +; SSE-NEXT: movaps %xmm14, (%rsi) +; SSE-NEXT: movaps %xmm12, 32(%rsi) +; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movaps %xmm7, 48(%rdx) ; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps %xmm7, 32(%rdx) -; SSE-NEXT: movaps %xmm9, 16(%rdx) -; SSE-NEXT: movaps %xmm6, 48(%rcx) -; SSE-NEXT: movaps %xmm10, 32(%rcx) -; SSE-NEXT: movaps %xmm12, 16(%rcx) +; SSE-NEXT: movaps %xmm6, 32(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movaps %xmm9, 48(%rcx) +; SSE-NEXT: movaps %xmm11, 32(%rcx) +; SSE-NEXT: movaps %xmm13, 16(%rcx) ; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%r8) @@ -469,103 +469,103 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 416(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm5 +; SSE-NEXT: movaps 384(%rdi), %xmm8 ; SSE-NEXT: movaps 160(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm9 ; SSE-NEXT: movaps 480(%rdi), %xmm2 -; SSE-NEXT: movaps 448(%rdi), %xmm7 +; SSE-NEXT: movaps 448(%rdi), %xmm10 ; SSE-NEXT: movaps 224(%rdi), %xmm3 ; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps 288(%rdi), %xmm4 ; SSE-NEXT: movaps 256(%rdi), %xmm13 -; SSE-NEXT: movaps 352(%rdi), %xmm9 +; SSE-NEXT: 
movaps 352(%rdi), %xmm5 ; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rdi), %xmm10 +; SSE-NEXT: movaps 96(%rdi), %xmm6 ; SSE-NEXT: movaps 64(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] +; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm8[1] +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0] -; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = 
xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps 144(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 304(%rdi), %xmm0 -; SSE-NEXT: movaps 272(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 272(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 336(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: movaps 336(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 400(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: movaps 400(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm0 -; SSE-NEXT: movaps 464(%rdi), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps 464(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps 16(%rdi), %xmm5 -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -576,7 +576,8 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 
%xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -599,22 +600,21 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps %xmm6, 96(%rcx) ; SSE-NEXT: movaps %xmm14, 32(%rcx) -; SSE-NEXT: movaps %xmm3, 112(%rcx) -; SSE-NEXT: movaps %xmm12, 48(%rcx) -; SSE-NEXT: movaps %xmm11, 64(%rcx) -; SSE-NEXT: movaps %xmm4, (%rcx) +; SSE-NEXT: movaps %xmm4, 112(%rcx) +; SSE-NEXT: movaps %xmm13, 48(%rcx) +; SSE-NEXT: movaps %xmm10, 64(%rcx) +; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps %xmm9, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm1, 112(%r8) -; SSE-NEXT: movaps %xmm2, 96(%r8) -; SSE-NEXT: movaps %xmm7, 80(%r8) -; SSE-NEXT: movaps %xmm10, 64(%r8) -; SSE-NEXT: movaps %xmm8, 48(%r8) -; SSE-NEXT: movaps %xmm13, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm5, (%r8) +; SSE-NEXT: movaps %xmm2, 112(%r8) +; SSE-NEXT: movaps %xmm5, 96(%r8) +; SSE-NEXT: movaps %xmm8, 80(%r8) +; SSE-NEXT: movaps %xmm7, 64(%r8) +; SSE-NEXT: movaps %xmm12, 48(%r8) +; SSE-NEXT: movaps %xmm11, 32(%r8) +; SSE-NEXT: movaps %xmm15, 16(%r8) +; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; @@ -638,16 +638,16 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill @@ -655,14 +655,14 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm12[1],xmm10[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 -; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] @@ -689,11 +689,11 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 @@ -765,16 +765,18 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-ONLY-LABEL: load_i64_stride4_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 @@ -790,93 +792,92 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm15, 
%ymm15 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; 
AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm5[1],ymm14[1],ymm5[3],ymm14[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%r8) -; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -959,15 +960,15 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $664, %rsp # imm = 0x298 ; SSE-NEXT: movaps 416(%rdi), %xmm0 ; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 160(%rdi), %xmm1 +; SSE-NEXT: movaps 160(%rdi), %xmm3 ; SSE-NEXT: movaps 128(%rdi), %xmm8 -; SSE-NEXT: movaps 480(%rdi), %xmm2 +; SSE-NEXT: movaps 480(%rdi), %xmm1 ; SSE-NEXT: movaps 448(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm4 +; SSE-NEXT: movaps 224(%rdi), %xmm5 ; SSE-NEXT: movaps 192(%rdi), %xmm10 -; SSE-NEXT: movaps 288(%rdi), %xmm5 +; SSE-NEXT: movaps 288(%rdi), %xmm4 ; SSE-NEXT: movaps 256(%rdi), %xmm12 -; SSE-NEXT: 
movaps 608(%rdi), %xmm3 +; SSE-NEXT: movaps 608(%rdi), %xmm2 ; SSE-NEXT: movaps 352(%rdi), %xmm6 ; SSE-NEXT: movaps 320(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm7 @@ -978,29 +979,29 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1009,9 +1010,9 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 576(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 ; SSE-NEXT: movaps 512(%rdi), %xmm1 @@ -1110,14 +1111,14 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} 
xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 464(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 560(%rdi), %xmm0 ; SSE-NEXT: movaps 528(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1126,46 +1127,46 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 624(%rdi), %xmm0 -; SSE-NEXT: movaps 592(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 592(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 688(%rdi), %xmm0 ; SSE-NEXT: movaps 656(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 752(%rdi), %xmm0 -; SSE-NEXT: movaps 720(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 720(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps 784(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 848(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 944(%rdi), %xmm0 -; SSE-NEXT: movaps 912(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps 1008(%rdi), %xmm5 +; SSE-NEXT: movaps 912(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 1008(%rdi), %xmm4 ; SSE-NEXT: movaps 976(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; SSE-NEXT: movaps 16(%rdi), %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movaps 16(%rdi), %xmm4 ; SSE-NEXT: movaps 48(%rdi), 
%xmm0 -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1231,11 +1232,11 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps %xmm1, 240(%rcx) -; SSE-NEXT: movaps %xmm3, 224(%rcx) -; SSE-NEXT: movaps %xmm6, 208(%rcx) -; SSE-NEXT: movaps %xmm8, 192(%rcx) -; SSE-NEXT: movaps %xmm11, 176(%rcx) -; SSE-NEXT: movaps %xmm14, 160(%rcx) +; SSE-NEXT: movaps %xmm5, 224(%rcx) +; SSE-NEXT: movaps %xmm8, 208(%rcx) +; SSE-NEXT: movaps %xmm10, 192(%rcx) +; SSE-NEXT: movaps %xmm12, 176(%rcx) +; SSE-NEXT: movaps %xmm15, 160(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1254,19 +1255,19 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps %xmm6, (%rcx) ; SSE-NEXT: movaps %xmm2, 240(%r8) -; SSE-NEXT: movaps %xmm4, 224(%r8) +; SSE-NEXT: movaps %xmm3, 224(%r8) ; SSE-NEXT: movaps %xmm7, 208(%r8) ; SSE-NEXT: movaps %xmm9, 192(%r8) -; SSE-NEXT: movaps %xmm12, 176(%r8) +; SSE-NEXT: movaps %xmm11, 176(%r8) ; SSE-NEXT: movaps %xmm13, 160(%r8) -; SSE-NEXT: movaps %xmm15, 144(%r8) +; SSE-NEXT: movaps %xmm14, 144(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%r8) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) @@ -1278,7 +1279,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r8) +; SSE-NEXT: movaps %xmm4, (%r8) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; @@ -1289,13 +1290,13 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 
= xmm5[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm7 @@ -1313,17 +1314,17 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm5[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -1393,11 +1394,11 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 @@ -1405,38 +1406,38 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 @@ -1460,54 +1461,54 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} 
ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1588,10 +1589,10 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm15, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) @@ -1630,11 +1631,11 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm10, %ymm10 @@ -1654,7 +1655,7 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1670,186 +1671,188 @@ define void @load_i64_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} 
ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 
+; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm15[0],ymm8[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte 
Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%r8) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2320,7 +2323,7 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 1040(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1136(%rdi), %xmm0 @@ -2371,55 +2374,55 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 1584(%rdi), %xmm0 ; SSE-NEXT: movaps 1552(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1648(%rdi), %xmm0 -; SSE-NEXT: movaps 1616(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 1616(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 1712(%rdi), %xmm0 -; SSE-NEXT: movaps 1680(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1680(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 1776(%rdi), %xmm0 ; SSE-NEXT: movaps 1744(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 1840(%rdi), %xmm0 -; SSE-NEXT: movaps 1808(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 1808(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm7 
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 1904(%rdi), %xmm0 -; SSE-NEXT: movaps 1872(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 1968(%rdi), %xmm0 -; SSE-NEXT: movaps 1936(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 1872(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps 2032(%rdi), %xmm13 -; SSE-NEXT: movaps 2000(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps 16(%rdi), %xmm13 +; SSE-NEXT: movaps 1968(%rdi), %xmm0 +; SSE-NEXT: movaps 1936(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps 2032(%rdi), %xmm8 +; SSE-NEXT: movaps 2000(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movaps 16(%rdi), %xmm10 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2548,14 +2551,16 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm4, 496(%rcx) -; SSE-NEXT: movaps %xmm1, 480(%rcx) -; SSE-NEXT: movaps %xmm2, 464(%rcx) -; SSE-NEXT: movaps %xmm3, 448(%rcx) -; SSE-NEXT: movaps %xmm9, 432(%rcx) -; SSE-NEXT: movaps %xmm8, 416(%rcx) -; SSE-NEXT: movaps %xmm11, 400(%rcx) -; SSE-NEXT: movaps %xmm14, 384(%rcx) +; SSE-NEXT: movaps %xmm1, 496(%rcx) +; SSE-NEXT: movaps %xmm3, 480(%rcx) +; SSE-NEXT: movaps %xmm5, 464(%rcx) +; SSE-NEXT: movaps %xmm7, 448(%rcx) +; SSE-NEXT: movaps %xmm11, 432(%rcx) +; SSE-NEXT: movaps %xmm14, 416(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2570,7 +2575,7 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 288(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) @@ -2602,19 +2607,17 
@@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm15, (%rcx) -; SSE-NEXT: movaps %xmm5, 496(%r8) -; SSE-NEXT: movaps %xmm6, 480(%r8) -; SSE-NEXT: movaps %xmm7, 464(%r8) -; SSE-NEXT: movaps %xmm10, 448(%r8) +; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movaps %xmm2, 496(%r8) +; SSE-NEXT: movaps %xmm4, 480(%r8) +; SSE-NEXT: movaps %xmm6, 464(%r8) +; SSE-NEXT: movaps %xmm9, 448(%r8) ; SSE-NEXT: movaps %xmm12, 432(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 416(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%r8) +; SSE-NEXT: movaps %xmm13, 416(%r8) +; SSE-NEXT: movaps %xmm15, 400(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%r8) @@ -2660,13 +2663,13 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm13, (%r8) +; SSE-NEXT: movaps %xmm10, (%r8) ; SSE-NEXT: addq $1688, %rsp # imm = 0x698 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2680, %rsp # imm = 0xA78 +; AVX1-ONLY-NEXT: subq $2728, %rsp # imm = 0xAA8 ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -2859,40 +2862,40 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 @@ -2907,417 +2910,421 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1712(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1680(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1712(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1680(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm8[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm12[1],mem[1] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm15[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 496(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 
496(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r8) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps 
%ymm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 480(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 448(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 384(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 320(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 256(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 256(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8) -; AVX1-ONLY-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r8) +; AVX1-ONLY-NEXT: addq $2728, %rsp # imm = 0xAA8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3453,402 +3460,404 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 +; 
AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 
1024(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm10[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 
32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm9[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm8[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3] +; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 
448(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps 
%ymm1, 416(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 384(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 288(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%r8) +; 
AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) @@ -3859,188 +3868,188 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i64_stride4_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm24 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm28 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm29 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] +; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm24, %zmm9, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] +; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm22, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm18, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm7 +; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [2,6,10,14,2,6,10,14] -; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] +; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm29, %zmm7 +; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm27, %zmm29, %zmm6 +; AVX512-NEXT: vpermt2q %zmm30, %zmm22, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm5 +; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 +; AVX512-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm2 +; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm2 +; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm31, %zmm1 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm4 +; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm3 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm20 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm21 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm30 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm21 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 +; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm17 -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm27 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm4 -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm24 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, 
%zmm5 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm18 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm10 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 +; AVX512-NEXT: 
vmovdqa64 %zmm8, %zmm17 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload @@ -4057,97 +4066,97 @@ define void @load_i64_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload ; AVX512-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm19[0,1,2,3],zmm23[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm3[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm14[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512-NEXT: # zmm19 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload -; AVX512-NEXT: # zmm23 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload -; AVX512-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm16[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm18[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload ; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload ; AVX512-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload +; AVX512-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload ; AVX512-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm27[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm29[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm27 # 64-byte Folded Reload -; AVX512-NEXT: # zmm27 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm8, %zmm8 # 64-byte Folded Reload -; AVX512-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload -; AVX512-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm21[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rsi) +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte Folded Reload +; AVX512-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload +; AVX512-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload +; AVX512-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm30, 320(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm26, 256(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm22, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm14, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm24, 448(%rcx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 128(%rsi) +; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm29, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 
# 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 448(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, 384(%r8) -; AVX512-NEXT: vmovdqa64 %zmm4, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm17, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm20, 320(%r8) -; AVX512-NEXT: vmovdqa64 %zmm8, 128(%r8) -; AVX512-NEXT: vmovdqa64 %zmm29, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm17, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm12, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm3, 384(%r8) +; AVX512-NEXT: vmovdqa64 %zmm11, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm23, 256(%r8) +; AVX512-NEXT: vmovdqa64 %zmm24, 320(%r8) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm27, 192(%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm27, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index a1c9ad9e8ded6..b677dd7079434 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -361,69 +361,69 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd 224(%rdi), %xmm0 ; SSE-NEXT: movapd 256(%rdi), %xmm4 ; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 288(%rdi), %xmm6 -; SSE-NEXT: movapd 208(%rdi), %xmm5 -; SSE-NEXT: movapd (%rdi), %xmm8 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm13 -; SSE-NEXT: movapd 48(%rdi), %xmm9 -; SSE-NEXT: movapd 240(%rdi), %xmm10 -; SSE-NEXT: movapd 272(%rdi), %xmm14 -; SSE-NEXT: movapd 160(%rdi), %xmm11 +; SSE-NEXT: movapd 288(%rdi), %xmm7 +; SSE-NEXT: movapd 208(%rdi), %xmm6 +; SSE-NEXT: movapd (%rdi), %xmm9 +; SSE-NEXT: movapd 16(%rdi), %xmm5 +; SSE-NEXT: movapd 32(%rdi), %xmm14 +; SSE-NEXT: movapd 48(%rdi), %xmm8 +; SSE-NEXT: movapd 240(%rdi), %xmm11 +; SSE-NEXT: movapd 272(%rdi), %xmm13 +; SSE-NEXT: movapd 160(%rdi), %xmm10 ; SSE-NEXT: movapd 192(%rdi), %xmm15 ; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm5[0] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] ; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm0[0] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm8[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movapd %xmm14, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm9[0],xmm15[1] +; SSE-NEXT: 
shufpd {{.*#+}} xmm9 = xmm9[1],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm13, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm14[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 80(%rdi), %xmm14 +; SSE-NEXT: movapd 80(%rdi), %xmm13 ; SSE-NEXT: movapd 112(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] +; SSE-NEXT: movapd %xmm4, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] ; SSE-NEXT: movapd 128(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm0[0] ; SSE-NEXT: movapd 96(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm3, 16(%rsi) -; SSE-NEXT: movapd %xmm13, 48(%rsi) +; SSE-NEXT: movapd 144(%rdi), %xmm3 +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movapd %xmm2, 16(%rsi) +; SSE-NEXT: movapd %xmm14, 48(%rsi) ; SSE-NEXT: movapd %xmm15, (%rsi) ; SSE-NEXT: movapd %xmm12, 32(%rsi) -; SSE-NEXT: movapd %xmm14, 16(%rdx) -; SSE-NEXT: movapd %xmm10, 48(%rdx) -; SSE-NEXT: movapd %xmm8, (%rdx) -; SSE-NEXT: movapd %xmm11, 32(%rdx) +; SSE-NEXT: movapd %xmm13, 16(%rdx) +; SSE-NEXT: movapd %xmm11, 48(%rdx) +; SSE-NEXT: movapd %xmm9, (%rdx) +; SSE-NEXT: movapd %xmm10, 32(%rdx) ; SSE-NEXT: movapd %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm6, 48(%rcx) -; SSE-NEXT: movapd %xmm9, (%rcx) -; SSE-NEXT: movapd %xmm5, 32(%rcx) +; SSE-NEXT: movapd %xmm7, 48(%rcx) +; SSE-NEXT: movapd %xmm8, (%rcx) +; SSE-NEXT: movapd %xmm6, 32(%rcx) ; SSE-NEXT: movapd %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movapd %xmm7, (%r8) +; SSE-NEXT: movapd %xmm5, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movapd %xmm2, 16(%r9) +; SSE-NEXT: movapd %xmm3, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -712,15 +712,15 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movapd 224(%rdi), %xmm1 +; SSE-NEXT: movapd 224(%rdi), %xmm3 ; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 64(%rdi), %xmm3 +; SSE-NEXT: movapd 64(%rdi), %xmm1 ; SSE-NEXT: movapd 176(%rdi), %xmm4 ; SSE-NEXT: movapd 96(%rdi), %xmm5 -; SSE-NEXT: movapd 208(%rdi), %xmm6 +; SSE-NEXT: movapd 208(%rdi), %xmm7 ; 
SSE-NEXT: movapd 128(%rdi), %xmm8 ; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm7 +; SSE-NEXT: movapd 16(%rdi), %xmm6 ; SSE-NEXT: movapd 32(%rdi), %xmm14 ; SSE-NEXT: movapd 48(%rdi), %xmm9 ; SSE-NEXT: movapd 160(%rdi), %xmm11 @@ -732,15 +732,15 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm3[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] @@ -752,14 +752,14 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm13, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm6[0] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -776,13 +776,13 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 320(%rdi), %xmm14 +; SSE-NEXT: movapd 320(%rdi), %xmm15 ; SSE-NEXT: movapd 352(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 368(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] +; SSE-NEXT: 
shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] ; SSE-NEXT: movapd 336(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -793,65 +793,65 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 400(%rdi), %xmm11 ; SSE-NEXT: movapd 432(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm11[0],xmm15[1] +; SSE-NEXT: movapd %xmm0, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd 448(%rdi), %xmm12 ; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm12[0] -; SSE-NEXT: movapd 416(%rdi), %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] +; SSE-NEXT: movapd 416(%rdi), %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm14[0],xmm12[1] ; SSE-NEXT: movapd 464(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 480(%rdi), %xmm2 -; SSE-NEXT: movapd 512(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] -; SSE-NEXT: movapd 528(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm3[0] -; SSE-NEXT: movapd 496(%rdi), %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] -; SSE-NEXT: movapd 544(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] -; SSE-NEXT: movapd 560(%rdi), %xmm5 -; SSE-NEXT: movapd 592(%rdi), %xmm10 -; SSE-NEXT: movapd %xmm10, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] -; SSE-NEXT: movapd 608(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] -; SSE-NEXT: movapd 576(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 624(%rdi), %xmm4 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm10[0],xmm4[1] -; SSE-NEXT: movapd %xmm7, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 32(%rsi) -; SSE-NEXT: movapd %xmm6, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%rsi) -; SSE-NEXT: movapd %xmm15, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movapd %xmm2, 96(%rdx) +; SSE-NEXT: movapd 480(%rdi), %xmm4 +; SSE-NEXT: movapd 512(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd 528(%rdi), %xmm7 +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm7[0] +; SSE-NEXT: movapd 496(%rdi), %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] +; SSE-NEXT: movapd 544(%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm10[0] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd 560(%rdi), %xmm0 +; SSE-NEXT: movapd 592(%rdi), %xmm5 +; SSE-NEXT: movapd %xmm5, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd 608(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE-NEXT: movapd 576(%rdi), %xmm3 +; SSE-NEXT: movsd 
{{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd 624(%rdi), %xmm8 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] +; SSE-NEXT: movapd %xmm6, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%rsi) +; SSE-NEXT: movapd %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movapd %xmm5, 112(%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movapd %xmm14, 64(%rdx) +; SSE-NEXT: movaps %xmm2, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movapd %xmm11, 80(%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movapd %xmm13, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movapd %xmm4, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 96(%rcx) -; SSE-NEXT: movapd %xmm0, 112(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movapd %xmm0, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movapd %xmm15, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movapd %xmm11, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movapd %xmm7, 96(%rcx) +; SSE-NEXT: movapd %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movapd %xmm12, 80(%rcx) @@ -863,9 +863,9 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm1, 112(%r8) -; SSE-NEXT: movapd %xmm8, 96(%r8) -; SSE-NEXT: movapd %xmm13, 80(%r8) +; SSE-NEXT: movapd %xmm3, 112(%r8) +; SSE-NEXT: movapd %xmm9, 96(%r8) +; SSE-NEXT: movapd %xmm14, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -876,8 +876,8 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm4, 112(%r9) -; SSE-NEXT: movapd %xmm9, 96(%r9) +; SSE-NEXT: movapd %xmm8, 112(%r9) +; SSE-NEXT: movapd %xmm10, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -895,174 +895,174 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $376, %rsp # imm = 0x178 +; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168 ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 ; 
AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3] ; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm11[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm14[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm0[0],xmm12[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm0[0],xmm6[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm13[0,1],ymm15[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm15[0],ymm2[3],ymm15[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[3],ymm15[2] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[3],ymm11[2] ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm5[0],ymm14[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm9[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm9[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, 
%ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[3],ymm10[2] ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm12[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm3, (%r8) +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm7, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%r9) -; AVX1-ONLY-NEXT: addq $376, %rsp # imm = 0x178 +; AVX1-ONLY-NEXT: vmovapd %ymm1, (%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: addq $360, %rsp # imm = 0x168 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1071,18 +1071,18 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 -; 
AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 @@ -1090,147 +1090,147 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm10[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm13[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; 
AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; 
AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 64(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: 
vmovaps %ymm3, 32(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1446,54 +1446,54 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $920, %rsp # imm = 0x398 -; SSE-NEXT: movapd 224(%rdi), %xmm0 -; SSE-NEXT: movapd 144(%rdi), %xmm1 -; SSE-NEXT: movapd 64(%rdi), %xmm2 -; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 96(%rdi), %xmm4 -; SSE-NEXT: movapd 208(%rdi), %xmm6 -; SSE-NEXT: movapd 128(%rdi), %xmm7 -; SSE-NEXT: movapd (%rdi), %xmm9 -; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd 32(%rdi), %xmm13 -; SSE-NEXT: movapd 48(%rdi), %xmm8 -; SSE-NEXT: movapd 160(%rdi), %xmm10 -; SSE-NEXT: movapd 192(%rdi), %xmm14 -; SSE-NEXT: movapd 80(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm13, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1] +; SSE-NEXT: movapd 224(%rdi), %xmm5 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 64(%rdi), %xmm3 +; SSE-NEXT: movapd 176(%rdi), %xmm7 +; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: movapd 208(%rdi), %xmm9 +; SSE-NEXT: movapd 128(%rdi), %xmm10 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm8 +; SSE-NEXT: movapd 32(%rdi), %xmm1 +; SSE-NEXT: movapd 48(%rdi), %xmm11 +; SSE-NEXT: movapd 160(%rdi), %xmm13 +; SSE-NEXT: movapd 192(%rdi), %xmm0 +; SSE-NEXT: movapd 80(%rdi), %xmm14 +; SSE-NEXT: movapd 112(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm1, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm11[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm8[0] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm1[0] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm3[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 
%xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm10[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm0[0] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -1639,91 +1639,91 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 960(%rdi), %xmm10 -; SSE-NEXT: movapd 992(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm14 +; SSE-NEXT: movapd 992(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] ; SSE-NEXT: movapd 1008(%rdi), %xmm15 ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm15[0] ; SSE-NEXT: movapd 976(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] -; SSE-NEXT: movapd 1024(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movapd 1024(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1040(%rdi), %xmm8 -; SSE-NEXT: movapd 1072(%rdi), %xmm3 -; SSE-NEXT: movapd %xmm3, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] -; SSE-NEXT: movapd 1088(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movapd 1056(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: movapd 1104(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd 1120(%rdi), %xmm1 -; SSE-NEXT: movapd 1152(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movapd 1168(%rdi), %xmm6 -; 
SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm6[0] -; SSE-NEXT: movapd 1136(%rdi), %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm11[0],xmm6[1] -; SSE-NEXT: movapd 1184(%rdi), %xmm2 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm2[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd 1040(%rdi), %xmm8 +; SSE-NEXT: movapd 1072(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] +; SSE-NEXT: movapd 1088(%rdi), %xmm11 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm11[0] +; SSE-NEXT: movapd 1056(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] +; SSE-NEXT: movapd 1104(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd 1120(%rdi), %xmm5 +; SSE-NEXT: movapd 1152(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd 1168(%rdi), %xmm6 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm6[0] +; SSE-NEXT: movapd 1136(%rdi), %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm12[0],xmm6[1] +; SSE-NEXT: movapd 1184(%rdi), %xmm0 +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1200(%rdi), %xmm0 -; SSE-NEXT: movapd 1232(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd 1248(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm3[0] +; SSE-NEXT: movapd 1232(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd 1248(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm4[0] ; SSE-NEXT: movapd 1216(%rdi), %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] -; SSE-NEXT: movapd 1264(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm12[0] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: movapd %xmm5, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movapd %xmm2, 240(%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: movapd 1264(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm13[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: movapd %xmm3, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 176(%rsi) +; SSE-NEXT: movaps %xmm2, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movapd %xmm1, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movapd 
%xmm14, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movapd %xmm13, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm1, 224(%rdx) -; SSE-NEXT: movapd %xmm0, 240(%rdx) -; SSE-NEXT: movapd %xmm10, 192(%rdx) -; SSE-NEXT: movapd %xmm8, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm9, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movapd %xmm5, 224(%rdx) +; SSE-NEXT: movapd %xmm0, 240(%rdx) +; SSE-NEXT: movapd %xmm10, 192(%rdx) +; SSE-NEXT: movapd %xmm8, 208(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) @@ -1743,9 +1743,9 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 240(%rcx) +; SSE-NEXT: movapd %xmm4, 240(%rcx) ; SSE-NEXT: movapd %xmm6, 224(%rcx) -; SSE-NEXT: movapd %xmm9, 208(%rcx) +; SSE-NEXT: movapd %xmm11, 208(%rcx) ; SSE-NEXT: movapd %xmm15, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rcx) @@ -1772,7 +1772,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm7, 240(%r8) -; SSE-NEXT: movapd %xmm11, 224(%r8) +; SSE-NEXT: movapd %xmm12, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1801,7 +1801,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm12, 240(%r9) +; SSE-NEXT: movapd %xmm13, 240(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -1837,479 +1837,474 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm9 +; AVX1-ONLY-NEXT: subq $1368, %rsp # imm = 0x558 +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm9[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm6[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 
1120(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm6[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 
384(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm1[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm2[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm4[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm10[0],ymm7[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: 
vmovaps 960(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm14[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm13[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = 
xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm15[0],ymm6[0],ymm15[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm14[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte 
Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, (%rsp), %xmm6, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded 
Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm2[0],ymm12[0],ymm2[3],ymm12[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[3],ymm13[2] ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm11[0],ymm8[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm2[0],ymm8[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm15[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[0],ymm14[0],ymm4[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm9[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 
= mem[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 
32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 160(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm13, (%r9) -; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 +; AVX1-ONLY-NEXT: vmovapd %ymm2, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 160(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: addq $1368, %rsp # imm = 0x558 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm0[2,3] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2317,76 +2312,78 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm14 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm11 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm10 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = 
ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2447,582 +2444,572 @@ define void 
@load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm1[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, (%rsp), %xmm15, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 
32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) 
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 192(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 128(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r9) +; AVX2-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride5_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm20 +; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, 
%zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm13, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm30 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm30, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm17, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm17, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm17 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 +; AVX512F-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,5,0,11,0,5,0,11] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm22, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <11,0,5,u> -; AVX512F-NEXT: vpermt2q %zmm9, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm15, %zmm31, %zmm18 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm31, %zmm19 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> +; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, 
%zmm31 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm30, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm15 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm12 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm10 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm16 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm14 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm31, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 ; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; 
AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm18 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; 
AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%r9) -; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm30, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm20 +; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 
1152(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm13, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm30 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm30, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm17, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, 
%zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <11,0,5,u> -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm18 -; AVX512BW-NEXT: vmovdqa64 
320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm19 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm15 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm12 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm10 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm16 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; 
AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, 
%zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 +; 
AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <160 x i64>, ptr %in.vec, align 64 @@ -3042,55 +3029,55 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i64_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $2200, %rsp # imm = 0x898 -; SSE-NEXT: movapd 224(%rdi), %xmm0 -; SSE-NEXT: movapd 144(%rdi), %xmm1 -; SSE-NEXT: movapd 64(%rdi), %xmm2 -; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 96(%rdi), %xmm4 -; SSE-NEXT: movapd 208(%rdi), %xmm6 -; SSE-NEXT: movapd 128(%rdi), %xmm7 -; SSE-NEXT: movapd (%rdi), %xmm9 -; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd 32(%rdi), %xmm13 -; SSE-NEXT: movapd 48(%rdi), %xmm8 -; SSE-NEXT: movapd 160(%rdi), %xmm10 -; SSE-NEXT: movapd 192(%rdi), %xmm14 -; SSE-NEXT: movapd 80(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm13, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1] +; SSE-NEXT: subq $2216, %rsp # imm = 0x8A8 +; SSE-NEXT: movapd 224(%rdi), %xmm5 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 64(%rdi), %xmm3 +; SSE-NEXT: movapd 176(%rdi), %xmm7 +; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: movapd 208(%rdi), %xmm10 +; SSE-NEXT: movapd 128(%rdi), %xmm9 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm8 +; SSE-NEXT: movapd 32(%rdi), %xmm0 +; SSE-NEXT: movapd 48(%rdi), %xmm11 +; SSE-NEXT: movapd 160(%rdi), %xmm13 +; SSE-NEXT: movapd 192(%rdi), %xmm1 +; SSE-NEXT: movapd 80(%rdi), %xmm14 +; SSE-NEXT: movapd 112(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm11[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm8[0] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm11[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm11 = 
xmm11[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm1[0] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm3[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm9[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm10[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -3450,7 +3437,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2048(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movapd 2016(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3477,24 +3464,24 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2160(%rdi), %xmm14 ; SSE-NEXT: movapd 2192(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm14[0],xmm13[1] +; SSE-NEXT: movapd %xmm0, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] ; SSE-NEXT: movapd 2208(%rdi), %xmm1 ; 
SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] ; SSE-NEXT: movapd 2176(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2224(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 2240(%rdi), %xmm11 +; SSE-NEXT: movapd 2240(%rdi), %xmm12 ; SSE-NEXT: movapd 2272(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm11[0],xmm9[1] +; SSE-NEXT: movapd %xmm0, %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm12[0],xmm8[1] ; SSE-NEXT: movapd 2288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movapd 2256(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3503,110 +3490,111 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 2320(%rdi), %xmm7 -; SSE-NEXT: movapd 2352(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: movapd 2320(%rdi), %xmm9 +; SSE-NEXT: movapd 2352(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] ; SSE-NEXT: movapd 2368(%rdi), %xmm15 -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm15[0] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm15[0] ; SSE-NEXT: movapd 2336(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] -; SSE-NEXT: movapd 2384(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movapd 2384(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 2400(%rdi), %xmm6 -; SSE-NEXT: movapd 2432(%rdi), %xmm10 -; SSE-NEXT: movapd %xmm10, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] -; SSE-NEXT: movapd 2448(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm12[0] -; SSE-NEXT: movapd 2416(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: movapd 2464(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 2480(%rdi), %xmm0 -; SSE-NEXT: movapd 2512(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: movapd 2400(%rdi), %xmm7 +; SSE-NEXT: movapd 2432(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: movapd 2448(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm13[0] +; SSE-NEXT: movapd 2416(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: movapd 2464(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: 
movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd 2528(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm8[0] -; SSE-NEXT: movapd 2496(%rdi), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] -; SSE-NEXT: movapd 2544(%rdi), %xmm2 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm2[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm1, 496(%rsi) -; SSE-NEXT: movapd %xmm3, 480(%rsi) -; SSE-NEXT: movapd %xmm5, 464(%rsi) -; SSE-NEXT: movapd %xmm9, 448(%rsi) -; SSE-NEXT: movapd %xmm13, 432(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 416(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 368(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movapd %xmm0, 496(%rdx) -; SSE-NEXT: movapd %xmm6, 480(%rdx) -; SSE-NEXT: 
movapd %xmm7, 464(%rdx) -; SSE-NEXT: movapd %xmm11, 448(%rdx) +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 2480(%rdi), %xmm5 +; SSE-NEXT: movapd 2512(%rdi), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: movapd 2528(%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm10[0] +; SSE-NEXT: movapd 2496(%rdi), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd 2544(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm2, 496(%rsi) +; SSE-NEXT: movapd %xmm4, 480(%rsi) +; SSE-NEXT: movapd %xmm6, 464(%rsi) +; SSE-NEXT: movapd %xmm8, 448(%rsi) +; SSE-NEXT: movapd %xmm11, 432(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 416(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movapd %xmm5, 496(%rdx) +; SSE-NEXT: movapd %xmm7, 480(%rdx) +; SSE-NEXT: movapd %xmm9, 464(%rdx) +; SSE-NEXT: movapd %xmm12, 448(%rdx) ; SSE-NEXT: movapd %xmm14, 432(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rdx) @@ -3658,12 +3646,12 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movapd %xmm8, 496(%rcx) -; SSE-NEXT: movapd %xmm12, 480(%rcx) +; SSE-NEXT: movapd %xmm10, 496(%rcx) +; SSE-NEXT: movapd %xmm13, 480(%rcx) ; SSE-NEXT: movapd %xmm15, 464(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 432(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rcx) @@ -3719,7 +3707,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm10, 496(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3782,7 +3771,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm2, 496(%r9) +; SSE-NEXT: movapd %xmm1, 496(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3845,215 +3834,213 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: addq $2200, %rsp # imm = 0x898 +; SSE-NEXT: addq $2216, %rsp # imm = 0x8A8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3288, %rsp # imm = 0xCD8 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $3256, %rsp # imm = 0xCB8 +; 
AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm13[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm6[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1856(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm9[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovapd 1792(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2080(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm10[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1856(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1792(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2080(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm12[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm11[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm14[0],ymm2[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 
608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] @@ -4062,43 +4049,38 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm1 -; 
AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm1 @@ -4107,22 +4089,26 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 @@ -4147,223 +4133,222 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, (%rsp), %xmm1, %xmm3 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm11[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 1776(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 1776(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 2096(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 1936(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 1936(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1296(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 1296(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm7[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr 
{{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm8[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] @@ -4373,52 +4358,52 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[3],ymm6[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] ; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[3],ymm5[2] ; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; 
AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1408(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; 
AVX1-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %ymm14 @@ -4428,7 +4413,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm11 @@ -4472,8 +4457,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd 2528(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] ; AVX1-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm15[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4698,7 +4682,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) @@ -4734,21 +4718,21 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $3288, %rsp # imm = 0xCD8 +; AVX1-ONLY-NEXT: addq $3256, %rsp # imm = 0xCB8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $3240, %rsp # imm = 0xCA8 -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: 
vmovaps 864(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 @@ -4761,317 +4745,316 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2496(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 
208(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} 
ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm13[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm14[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; 
AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm15[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 
= ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm11 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; 
AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5187,8 +5170,8 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5204,11 +5187,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5219,22 +5202,23 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] @@ -5248,14 +5232,14 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] @@ -5270,7 +5254,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] @@ -5278,15 +5262,16 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] @@ -5321,87 +5306,87 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # 
xmm11 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] @@ -5537,22 +5522,22 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 448(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 416(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 384(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 352(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 320(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 288(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 256(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 192(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 128(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 448(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 416(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 384(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 352(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 320(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 288(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 256(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 160(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 128(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX2-ONLY-NEXT: addq $3240, %rsp # imm = 0xCA8 @@ -5561,949 +5546,937 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 
1088(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm26 ; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; 
AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm9, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm19 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 ; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm6 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; 
AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm31 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <11,0,5,u> -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm31 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm27 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm26 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q 
%zmm30, %zmm1, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm28, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm9 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm18[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm12, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm5 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm29[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm18 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm14 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm17 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512F-NEXT: 
vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), 
%zmm16 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm19 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = 
[0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm28, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm28, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm28, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 
{%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm30 {%k1} -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm10 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 256(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rdx) -; AVX512F-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps 
%zmm1, 128(%rsi) +; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 384(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 320(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm20, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm30, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%r9) -; AVX512F-NEXT: 
addq $3336, %rsp # imm = 0xD08 +; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, 
%zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, 
%zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm19 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 ; AVX512BW-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <11,0,5,u> -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm31 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm27 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm26, 
%zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm26 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 
+; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512BW-NEXT: vpermt2q 
%zmm10, %zmm8, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm9 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm12, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm5 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm29[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm28, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm30 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte 
Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 448(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 448(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 384(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: 
vmovaps %zmm1, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%r9) -; AVX512BW-NEXT: addq $3336, %rsp # imm = 0xD08 +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <320 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 05f07039fd67e..ffdee1430c711 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -402,109 +402,109 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps 208(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm12 -; SSE-NEXT: movaps 304(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm15 -; SSE-NEXT: movaps (%rdi), %xmm3 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps 240(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdi), %xmm5 -; SSE-NEXT: movaps 336(%rdi), %xmm11 -; SSE-NEXT: movaps 288(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] -; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1] +; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps 256(%rdi), %xmm13 +; SSE-NEXT: movaps 208(%rdi), %xmm5 +; SSE-NEXT: movaps 352(%rdi), %xmm15 +; SSE-NEXT: movaps 304(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps 96(%rdi), %xmm11 +; SSE-NEXT: movaps 240(%rdi), %xmm3 +; SSE-NEXT: movaps 192(%rdi), %xmm12 +; SSE-NEXT: movaps 336(%rdi), %xmm4 +; SSE-NEXT: movaps 288(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0] +; SSE-NEXT: movaps %xmm14, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] +; 
SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps 320(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps 272(%rdi), %xmm1 -; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm13, 16(%rsi) -; SSE-NEXT: movaps %xmm11, 32(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps %xmm9, 16(%rcx) -; SSE-NEXT: movaps %xmm12, 32(%rcx) -; SSE-NEXT: movaps %xmm15, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm7, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: movaps 80(%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 320(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 272(%rdi), %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm0 +; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm12, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm9, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm7, 16(%rcx) +; SSE-NEXT: movaps %xmm15, 32(%rcx) +; SSE-NEXT: movaps %xmm14, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm13, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps %xmm2, 16(%r9) -; SSE-NEXT: movaps %xmm5, 32(%r9) +; SSE-NEXT: movaps %xmm4, 32(%r9) ; SSE-NEXT: movaps %xmm6, 48(%r9) -; SSE-NEXT: movaps %xmm10, (%r9) +; SSE-NEXT: movaps %xmm8, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm3, 48(%rax) -; SSE-NEXT: movaps %xmm8, (%rax) +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps %xmm5, 
48(%rax) +; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; @@ -597,54 +597,54 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm11[0],xmm8[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm8[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm11[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm12[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm12[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 
= xmm10[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm15[1],ymm9[3],ymm15[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm14[1],xmm13[1] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm10[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] @@ -656,20 +656,20 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm14[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm9, (%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r9) @@ -682,80 +682,80 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-LABEL: load_i64_stride6_vf8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <0,6,12,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: movb $56, %dil ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 ; AVX512F-NEXT: movb $-64, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <1,7,13,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,11,5,11,5,11,5,11] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, 
%zmm3, %zmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <10,0,6,u> ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: movb $24, %dil -; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} -; AVX512F-NEXT: movb $-32, %dil ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512F-NEXT: movb $-32, %dil +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <11,1,7,u> ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512F-NEXT: vinserti32x4 $0, %xmm7, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 {%k2} -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,0,1,7,13,0,1,7] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) @@ -767,80 +767,80 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i64_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,6,12,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = 
[0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <1,7,13,u> ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <10,0,6,u> ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: movb $24, %dil -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} -; AVX512BW-NEXT: movb $-32, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: 
# zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512BW-NEXT: movb $-32, %dil +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <11,1,7,u> ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,0,1,7,13,0,1,7] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) @@ -949,7 +949,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 304(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm0 @@ -958,25 +958,26 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 -; SSE-NEXT: movaps 496(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 640(%rdi), %xmm0 -; SSE-NEXT: movaps 592(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 592(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps 688(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps 688(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -991,26 +992,25 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 272(%rdi), %xmm0 -; SSE-NEXT: movaps 224(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 320(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 464(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 416(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 560(%rdi), %xmm0 -; SSE-NEXT: movaps 512(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps 512(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 656(%rdi), %xmm0 ; SSE-NEXT: movaps 608(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm4 @@ -1063,16 +1063,17 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: 
movaps %xmm12, 96(%r8) -; SSE-NEXT: movaps %xmm14, 80(%r8) +; SSE-NEXT: movaps %xmm11, 112(%r8) +; SSE-NEXT: movaps %xmm15, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%r8) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) @@ -1085,10 +1086,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm2, 112(%r9) ; SSE-NEXT: movaps %xmm4, 96(%r9) ; SSE-NEXT: movaps %xmm7, 80(%r9) -; SSE-NEXT: movaps %xmm9, 64(%r9) -; SSE-NEXT: movaps %xmm15, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm8, 64(%r9) +; SSE-NEXT: movaps %xmm10, 48(%r9) +; SSE-NEXT: movaps %xmm14, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1096,10 +1096,10 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps %xmm3, 96(%rax) -; SSE-NEXT: movaps %xmm5, 80(%rax) -; SSE-NEXT: movaps %xmm6, 64(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: movaps %xmm11, 32(%rax) +; SSE-NEXT: movaps %xmm6, 80(%rax) +; SSE-NEXT: movaps %xmm5, 64(%rax) +; SSE-NEXT: movaps %xmm9, 48(%rax) +; SSE-NEXT: movaps %xmm12, 32(%rax) ; SSE-NEXT: movaps %xmm13, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) @@ -1158,109 +1158,109 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm7[0],ymm15[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm3[0],xmm4[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm7[1],ymm15[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 512(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 512(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm12[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm13[0],ymm2[2],ymm13[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm14[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm14[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm8[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) @@ -1294,7 +1294,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1305,221 +1305,222 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) ; AVX1-ONLY-NEXT: 
addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride6_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm15[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovaps %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm9[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm12[0],ymm5[0],ymm12[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm4 
+; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm15[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm3[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm10[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm8[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm10[1],ymm4[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm13[1],ymm1[1],ymm13[3],ymm1[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm9[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: 
vmovaps 80(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm5[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 
328(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm14, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rax) -; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm7[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) +; AVX2-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1830,15 +1831,15 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $1176, %rsp # imm = 0x498 ; SSE-NEXT: movaps 624(%rdi), %xmm0 ; SSE-NEXT: movaps 576(%rdi), %xmm9 -; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rdi), %xmm3 ; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps 720(%rdi), %xmm2 +; SSE-NEXT: movaps 720(%rdi), %xmm1 ; SSE-NEXT: movaps 672(%rdi), %xmm11 -; SSE-NEXT: movaps 336(%rdi), %xmm4 +; SSE-NEXT: movaps 336(%rdi), %xmm5 ; SSE-NEXT: movaps 288(%rdi), %xmm10 -; SSE-NEXT: movaps 432(%rdi), %xmm5 +; SSE-NEXT: movaps 432(%rdi), %xmm4 ; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps 912(%rdi), %xmm3 +; SSE-NEXT: movaps 912(%rdi), %xmm2 ; SSE-NEXT: movaps 528(%rdi), %xmm6 ; SSE-NEXT: movaps 480(%rdi), %xmm14 ; SSE-NEXT: movaps 144(%rdi), %xmm7 @@ -1849,29 +1850,29 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = 
xmm10[1],xmm5[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] +; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -1880,9 +1881,9 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 864(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps 768(%rdi), %xmm1 @@ -2084,7 +2085,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 416(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 560(%rdi), %xmm0 @@ -2093,7 +2094,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 656(%rdi), %xmm0 ; SSE-NEXT: movaps 608(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2102,23 +2103,23 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 752(%rdi), %xmm0 -; SSE-NEXT: movaps 704(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 704(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 848(%rdi), %xmm0 -; SSE-NEXT: movaps 800(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 800(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 944(%rdi), %xmm0 -; SSE-NEXT: movaps 896(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 1040(%rdi), %xmm0 ; SSE-NEXT: movaps 992(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, %xmm12 @@ -2293,7 +2294,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) @@ -2310,12 +2311,12 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm7, 192(%rax) ; SSE-NEXT: movaps %xmm9, 176(%rax) ; SSE-NEXT: movaps %xmm10, 160(%rax) -; SSE-NEXT: movaps %xmm14, 144(%rax) -; SSE-NEXT: movaps %xmm13, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps %xmm13, 144(%rax) +; SSE-NEXT: movaps %xmm15, 128(%rax) +; SSE-NEXT: movaps %xmm14, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -2506,26 +2507,26 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm14 +; 
AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] @@ -2574,8 +2575,8 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm13[1],ymm3[3],ymm13[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] @@ -2585,7 +2586,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2597,13 +2598,13 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -2720,409 +2721,412 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 -; AVX1-ONLY-NEXT: vzeroupper -; AVX1-ONLY-NEXT: retq -; -; AVX2-ONLY-LABEL: load_i64_stride6_vf32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 
# 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $1624, %rsp # imm = 
0x658 +; AVX1-ONLY-NEXT: vzeroupper +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i64_stride6_vf32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm6[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm14 -; 
AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm5[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm3[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm13[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm4[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm12[1] -; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm13[1] +; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm5[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm0[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm11[0],ymm5[2],ymm11[2] -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 
784(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm11[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm14[0],ymm3[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] ; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm15[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm11[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = 
ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = 
xmm1[0],xmm5[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 @@ -3130,40 +3134,40 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm11[0] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 @@ -3171,486 +3175,490 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm13[1],xmm14[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm14[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm10[1],xmm11[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 904(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1096(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 1480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 1480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX2-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride6_vf32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; 
AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,7,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,1,7,13,0,1,7] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm9, %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm14, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <10,0,6,u> -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <11,1,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,11,5,11,5,11,5,11] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm9, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u> +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = 
<11,1,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm9, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm13, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,6,12,0,0,6,12] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,7,13,0,1,7,13] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 704(%rdi), 
%zmm30 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,10,0,6,0,10,0,6] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; 
AVX512F-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 ; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} ; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 
%zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm3, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, 
%zmm6 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3658,347 +3666,351 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,7,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm9, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <10,0,6,u> -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, 
%zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <11,1,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u> +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 
320(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,1,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm9, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,7,13,0,1,7,13] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 
%zmm26, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,10,0,6,0,10,0,6] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 
{%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 
{%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm15, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqa64 
%zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4586,7 +4598,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 1568(%rdi), 
%xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1712(%rdi), %xmm0 @@ -4630,55 +4642,55 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 2288(%rdi), %xmm0 -; SSE-NEXT: movaps 2240(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 2240(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 2384(%rdi), %xmm0 -; SSE-NEXT: movaps 2336(%rdi), %xmm14 +; SSE-NEXT: movaps 2336(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 2480(%rdi), %xmm0 +; SSE-NEXT: movaps 2432(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 2480(%rdi), %xmm0 -; SSE-NEXT: movaps 2432(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 2576(%rdi), %xmm0 -; SSE-NEXT: movaps 2528(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 2528(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 2672(%rdi), %xmm0 ; SSE-NEXT: movaps 2624(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 2768(%rdi), %xmm0 -; SSE-NEXT: movaps 2720(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps 2720(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 2864(%rdi), %xmm0 -; SSE-NEXT: movaps 2816(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 2960(%rdi), %xmm0 -; SSE-NEXT: movaps 2912(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] 
+; SSE-NEXT: movaps 2864(%rdi), %xmm0 +; SSE-NEXT: movaps 2816(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 2960(%rdi), %xmm0 +; SSE-NEXT: movaps 2912(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps 3056(%rdi), %xmm0 -; SSE-NEXT: movaps 3008(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 3008(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4935,13 +4947,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movaps %xmm1, 496(%r9) -; SSE-NEXT: movaps %xmm3, 480(%r9) -; SSE-NEXT: movaps %xmm6, 464(%r9) +; SSE-NEXT: movaps %xmm2, 496(%r9) +; SSE-NEXT: movaps %xmm4, 480(%r9) +; SSE-NEXT: movaps %xmm7, 464(%r9) ; SSE-NEXT: movaps %xmm8, 448(%r9) -; SSE-NEXT: movaps %xmm10, 432(%r9) -; SSE-NEXT: movaps %xmm12, 416(%r9) -; SSE-NEXT: movaps %xmm15, 400(%r9) +; SSE-NEXT: movaps %xmm11, 432(%r9) +; SSE-NEXT: movaps %xmm13, 416(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4958,7 +4971,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 288(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%r9) @@ -4993,17 +5006,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 496(%rax) -; SSE-NEXT: movaps %xmm4, 480(%rax) -; SSE-NEXT: movaps %xmm5, 464(%rax) -; SSE-NEXT: movaps %xmm7, 448(%rax) +; SSE-NEXT: movaps %xmm1, 496(%rax) +; SSE-NEXT: movaps %xmm3, 480(%rax) +; SSE-NEXT: movaps %xmm6, 464(%rax) +; SSE-NEXT: movaps %xmm5, 448(%rax) ; SSE-NEXT: movaps %xmm9, 432(%rax) -; SSE-NEXT: movaps %xmm11, 416(%rax) -; SSE-NEXT: movaps %xmm13, 400(%rax) -; SSE-NEXT: movaps %xmm14, 384(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 368(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, 416(%rax) +; SSE-NEXT: movaps %xmm14, 400(%rax) +; SSE-NEXT: movaps %xmm12, 384(%rax) +; SSE-NEXT: movaps %xmm15, 368(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) @@ -5511,11 +5523,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 @@ -5639,13 +5651,13 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -5919,297 +5931,297 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 
32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9)
-; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rax)
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax)
-; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rax)
-; AVX1-ONLY-NEXT: vmovaps %ymm13, 384(%rax)
-; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax)
-; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
-; AVX1-ONLY-NEXT: addq $3768, %rsp # imm = 0xEB8
-; AVX1-ONLY-NEXT: vzeroupper
-; AVX1-ONLY-NEXT: retq
-;
-; AVX2-ONLY-LABEL: load_i64_stride6_vf64:
-; AVX2-ONLY: # %bb.0:
-; AVX2-ONLY-NEXT: subq $3416, %rsp # imm = 0xD58
-; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4
-; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3
-; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5
-; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6
-; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7
-; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0
-; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0
-; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0
-; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 1776(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 2160(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm2
-; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm0
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9)
+; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rax)
+; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax)
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax)
+; AVX1-ONLY-NEXT: vmovaps %ymm13, 384(%rax)
+; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax)
+; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
+; AVX1-ONLY-NEXT: addq $3768, %rsp # imm = 0xEB8
+; AVX1-ONLY-NEXT: vzeroupper
+; AVX1-ONLY-NEXT: retq
+;
+; AVX2-ONLY-LABEL: load_i64_stride6_vf64:
+; AVX2-ONLY: # %bb.0:
+; AVX2-ONLY-NEXT: subq $3432, %rsp # imm = 0xD68
+; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2
+; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3
+; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5
+; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6
+; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7
+; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0
+; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm1
+; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0
+; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0
+; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 2544(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; AVX2-ONLY-NEXT: vmovaps 1776(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; AVX2-ONLY-NEXT: vmovaps 2160(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm1
+; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; AVX2-ONLY-NEXT: vmovaps 2544(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0
@@ -6218,11 +6230,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0
@@ -6231,11 +6243,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0
@@ -6268,9 +6280,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm14
+; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm9
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm14[0]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0
@@ -6279,16 +6292,16 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 1584(%rdi), %xmm12
+; AVX2-ONLY-NEXT: vmovaps 1584(%rdi), %xmm14
; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm7
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm14[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm12
+; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2]
+; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-ONLY-NEXT: vmovaps 1968(%rdi), %xmm10
; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm5
@@ -6403,13 +6416,14 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm14[1]
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm0 = xmm9[1],mem[1]
; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm9
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm12[1]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm14[1]
; AVX2-ONLY-NEXT: vbroadcastsd 1640(%rdi), %ymm7
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
@@ -6417,8 +6431,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm10[1]
; AVX2-ONLY-NEXT: vbroadcastsd 2024(%rdi), %ymm5
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm6[1]
@@ -6517,7 +6530,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vmovaps 2704(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6534,11 +6547,11 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vbroadcastsd 2080(%rdi), %ymm0
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1936(%rdi), %xmm2
+; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm2
; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 1936(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vbroadcastsd 1696(%rdi), %ymm0
@@ -6562,63 +6575,66 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2]
; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm6
; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm5
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm5[0],xmm6[0]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm6[0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
-; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm15
+; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm13
; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm3
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm3[0],xmm15[0]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm3[0],xmm13[0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2]
-; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm14
+; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm15
; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm0[0]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1]
-; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm13
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm13[1],ymm2[3],ymm13[3]
-; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1]
+; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm14[1],ymm2[3],ymm14[3]
+; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1]
-; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm14
+; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3]
-; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm15
+; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm15[1]
-; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
-; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm13[1]
+; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
+; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm3
+; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1]
-; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm4
+; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3]
-; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm4
+; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1]
; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3]
-; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm15
+; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm5
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -6626,17 +6642,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1]
-; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm7
+; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3]
-; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
+; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm6
+; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1]
-; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm5
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm5[1],ymm10[3],ymm5[3]
-; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm2[1],ymm10[3],ymm2[3]
+; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm7
+; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6646,7 +6664,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10
+; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm8
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -6666,7 +6684,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm8
+; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10
; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -6686,10 +6704,9 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1]
; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm2
+; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm6
-; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6713,7 +6730,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1]
; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm2
@@ -6734,140 +6751,144 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm12
+; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12
; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0]
+; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm12
+; AVX2-ONLY-NEXT: vmovaps %xmm12, (%rsp) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm12
-; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm3
; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm3
; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm3
; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm3
; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm3
; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm3
; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1
-; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm3
+; AVX2-ONLY-NEXT: vmovaps 1616(%rdi), %xmm3
; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm14
+; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm13
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 2000(%rdi), %xmm12
+; AVX2-ONLY-NEXT: vmovaps 2000(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm11
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 2192(%rdi), %xmm10
+; AVX2-ONLY-NEXT: vmovaps 2192(%rdi), %xmm1
+; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %xmm9
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 2384(%rdi), %xmm8
+; AVX2-ONLY-NEXT: vmovaps 2384(%rdi), %xmm12
; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %xmm7
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
-; AVX2-ONLY-NEXT: vmovaps 2576(%rdi), %xmm6
+; AVX2-ONLY-NEXT: vmovaps 2576(%rdi), %xmm8
; AVX2-ONLY-NEXT: vmovaps 2528(%rdi), %xmm5
-; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0]
+; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
; AVX2-ONLY-NEXT: vmovaps 2768(%rdi), %xmm4
; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %xmm3
@@ -6894,7 +6915,7 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-ONLY-NEXT: # xmm15 = xmm15[1],mem[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6956,38 +6977,38 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vbroadcastsd 1864(%rdi), %ymm0
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1]
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm13 = xmm13[1],mem[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vbroadcastsd 2056(%rdi), %ymm0
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1]
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm11 = xmm11[1],mem[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vbroadcastsd 2248(%rdi), %ymm0
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1]
+; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vbroadcastsd 2440(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 2632(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 2824(%rdi), %ymm0
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-ONLY-NEXT: vbroadcastsd 3016(%rdi), %ymm0
; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 2632(%rdi), %ymm7
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 2824(%rdi), %ymm6
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT: vbroadcastsd 3016(%rdi), %ymm3
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -7149,10 +7170,10 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9)
; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rax)
-; AVX2-ONLY-NEXT: vmovaps %ymm4, 448(%rax)
+; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax)
+; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%rax)
; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%rax)
-; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rax)
+; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rax)
; AVX2-ONLY-NEXT: vmovaps %ymm9, 352(%rax)
; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rax)
; AVX2-ONLY-NEXT: vmovaps %ymm13, 288(%rax)
@@ -7173,596 +7194,599 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax)
; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax)
-; AVX2-ONLY-NEXT: addq $3416, %rsp # imm = 0xD58
+; AVX2-ONLY-NEXT: addq $3432, %rsp # imm = 0xD68
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
; AVX512F-LABEL: load_i64_stride6_vf64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: subq $7240, %rsp # imm = 0x1C48
-; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm4
-; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm1
+; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm3
+; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm4
+; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0
; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5
-; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm15
-; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6
-; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,0,10,0,6,0,10]
+; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm26
+; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2
+; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6
+; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10]
+; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm8
+; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8
+; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm8
+; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8
+; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm8
+; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8
+; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm8
+; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11]
; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm9
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9
+; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm9
; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9
; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm9
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm9
; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9
-; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm9
+; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm9
+; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm9
; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,7,0,11,1,7,0,11]
-; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [10,4,10,4,10,4,10,4]
-; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,5,11,5,11,5,11,5]
-; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,0,0,6,12,0,0,6]
-; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm3
-; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,0,1,7,13,0,1,7]
-; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6
+; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4]
+; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm10
+; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10
+; AVX512F-NEXT: vpermt2q %zmm29, %zmm9, %zmm10
+; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm10
+; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10
+; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm10
+; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5]
+; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm11
+; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11
+; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm11
+; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm11
+; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11
+; AVX512F-NEXT: vpermt2q %zmm26, %zmm10, %zmm11
+; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6]
+; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13
+; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12
+; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm13
+; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7]
+; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm12
+; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1
+; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm1
+; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm6
; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm9
-; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4
+; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm0
+; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm5
+; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm5
; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm0
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm1
+; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1
; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm1
+; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm1
; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1
; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1
; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4
-; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3
+; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1
; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3
+;
AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm1 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <0,6,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <10,0,6,u> -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <11,1,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <1,7,13,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, 
%zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 -; AVX512F-NEXT: 
vmovdqa64 1536(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm13, %zmm28 -; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm17 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm14, %zmm30 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm15, %zmm31 -; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm25, 
%zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 +; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm12 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm4 -; 
AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 
%zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} @@ 
-7788,146 +7812,144 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm29, %zmm19, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm22, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte 
Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm20, %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 256(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, 448(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 256(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 320(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 256(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 320(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 256(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 320(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 256(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 256(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 320(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 256(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 320(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%r8) +; 
AVX512F-NEXT: vmovdqa64 %zmm27, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512F-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm0, 384(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512F-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -7935,589 +7957,592 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; 
AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, 
%zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,6,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 
{{.*#+}} ymm14 = <1,7,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <10,0,6,u> -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <11,1,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), 
%zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm2 +; 
AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; 
AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm28 -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; 
AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm30 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm31 -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: 
vpermt2q %zmm11, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 ; AVX512BW-NEXT: movb $56, %al ; 
AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 
%zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} @@ -8543,146 +8568,144 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; 
AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 
{%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm22, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 384(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 256(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 320(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 256(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 320(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 
64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 256(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 256(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 320(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 
128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 320(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512BW-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 70b718cd7f282..9841686a648a5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -531,16 +531,16 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i64_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $88, %rsp -; SSE-NEXT: movapd 320(%rdi), %xmm0 -; SSE-NEXT: movapd 208(%rdi), %xmm1 -; SSE-NEXT: movapd 256(%rdi), %xmm2 -; SSE-NEXT: movapd 144(%rdi), %xmm3 -; SSE-NEXT: movapd 304(%rdi), %xmm4 -; SSE-NEXT: movapd 192(%rdi), %xmm5 -; SSE-NEXT: movapd 240(%rdi), %xmm6 -; SSE-NEXT: movapd 128(%rdi), %xmm7 -; SSE-NEXT: movapd 288(%rdi), %xmm8 -; SSE-NEXT: movapd 176(%rdi), %xmm9 +; SSE-NEXT: movapd 320(%rdi), %xmm1 +; SSE-NEXT: movapd 208(%rdi), %xmm0 +; SSE-NEXT: movapd 256(%rdi), %xmm3 +; SSE-NEXT: movapd 144(%rdi), %xmm2 +; SSE-NEXT: movapd 304(%rdi), %xmm5 +; SSE-NEXT: movapd 192(%rdi), %xmm4 +; SSE-NEXT: movapd 240(%rdi), %xmm7 +; SSE-NEXT: movapd 128(%rdi), %xmm6 +; SSE-NEXT: movapd 288(%rdi), %xmm9 +; 
SSE-NEXT: movapd 176(%rdi), %xmm8 ; SSE-NEXT: movapd 336(%rdi), %xmm10 ; SSE-NEXT: movapd 224(%rdi), %xmm11 ; SSE-NEXT: movapd 272(%rdi), %xmm14 @@ -549,21 +549,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd %xmm15, %xmm12 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] -; SSE-NEXT: movapd %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm11[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm8[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm8[0] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] @@ -571,49 +557,63 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 384(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm9[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 384(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1] ; SSE-NEXT: movapd 400(%rdi), %xmm7 ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 352(%rdi), %xmm8 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm8[0],xmm7[1] -; SSE-NEXT: movapd 416(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movapd 368(%rdi), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1] -; SSE-NEXT: movapd 432(%rdi), %xmm11 -; 
SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm11[0] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] -; SSE-NEXT: movapd (%rdi), %xmm5 -; SSE-NEXT: movapd 48(%rdi), %xmm12 -; SSE-NEXT: movapd %xmm12, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] +; SSE-NEXT: movapd 416(%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm10[0] +; SSE-NEXT: movapd 368(%rdi), %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1] +; SSE-NEXT: movapd 432(%rdi), %xmm14 +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm14[0] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd (%rdi), %xmm2 +; SSE-NEXT: movapd 48(%rdi), %xmm9 +; SSE-NEXT: movapd %xmm9, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] ; SSE-NEXT: movapd 64(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] ; SSE-NEXT: movapd 16(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 80(%rdi), %xmm2 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] -; SSE-NEXT: movapd 32(%rdi), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE-NEXT: movapd 96(%rdi), %xmm4 -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1] -; SSE-NEXT: movapd %xmm6, (%rsi) -; SSE-NEXT: movapd %xmm14, 48(%rsi) -; SSE-NEXT: movapd %xmm15, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movapd %xmm5, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movapd 80(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm4[0] +; SSE-NEXT: movapd 32(%rdi), %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: movapd %xmm3, (%rsi) +; SSE-NEXT: movapd %xmm11, 48(%rsi) +; SSE-NEXT: movapd %xmm12, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movapd %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movapd %xmm13, 16(%rdx) ; SSE-NEXT: movapd %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm7, 48(%rcx) @@ -627,22 +627,22 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm2, (%r9) -; SSE-NEXT: movapd %xmm9, 48(%r9) +; SSE-NEXT: movapd %xmm4, (%r9) +; SSE-NEXT: movapd %xmm10, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm3, (%rax) -; SSE-NEXT: movapd %xmm10, 48(%rax) +; SSE-NEXT: movapd %xmm5, (%rax) +; SSE-NEXT: movapd %xmm15, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps 
%xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, (%rax) -; SSE-NEXT: movapd %xmm11, 48(%rax) +; SSE-NEXT: movapd %xmm6, (%rax) +; SSE-NEXT: movapd %xmm14, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -652,7 +652,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm6 @@ -663,7 +663,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -677,7 +677,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[3],ymm8[2] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm10[0],ymm7[3],ymm10[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3] @@ -688,11 +688,11 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm15 = xmm14[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1],ymm9[0],ymm11[2],ymm9[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[2] ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 @@ -706,8 +706,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm10[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; 
AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] @@ -738,12 +738,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 32(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm8, (%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, (%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm10, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovapd %ymm1, 32(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm5, (%rax) @@ -756,9 +756,9 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i64_stride7_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm7 @@ -783,7 +783,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] @@ -792,20 +792,20 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] -; 
AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 @@ -814,8 +814,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] @@ -824,17 +824,17 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -842,129 +842,129 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa %ymm8, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; 
AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride7_vf8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,u> -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <0,7,14,u> +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512F-NEXT: movb $24, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm7[4,5,4,5],zmm6[4,5,4,5] +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5] ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 ; AVX512F-NEXT: movb $-32, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k2} +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,10,0,5,6,10] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm11 ; 
AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm10, %zmm10 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [4,11] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,11,4,11] -; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 +; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm8, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm13 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] +; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> -; AVX512F-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 {%k2} +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 {%k1} ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,11,0,5,6,11] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm13, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} +; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 +; AVX512F-NEXT: vinserti32x4 $0, 
%xmm15, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm9, %zmm12 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] -; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512F-NEXT: vmovdqa64 
%zmm9, (%r10) -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%r10) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -972,110 +972,110 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,u> -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,7,14,u> +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: movb $24, %r11b -; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm7[4,5,4,5],zmm6[4,5,4,5] +; AVX512BW-NEXT: kmovd %r11d, %k2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm7[4,5,4,5],zmm6[4,5,4,5] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: movb $-32, %r11b -; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k2} +; AVX512BW-NEXT: kmovd %r11d, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,10,0,5,6,10] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm10, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm13 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-NEXT: 
vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm8, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm13 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] ; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k1} ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm9, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,7,14,0,0,7,14,0] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm9, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm12 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [5,12] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [5,12] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [2,9,2,9,2,9,2,9] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r10) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r10) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <56 x i64>, ptr %in.vec, align 64 @@ -1100,58 +1100,58 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride7_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $536, %rsp # imm = 0x218 -; SSE-NEXT: movapd 208(%rdi), %xmm0 -; SSE-NEXT: movapd 96(%rdi), %xmm1 -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 192(%rdi), %xmm3 -; 
SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 64(%rdi), %xmm8 -; SSE-NEXT: movapd 176(%rdi), %xmm9 -; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd 48(%rdi), %xmm14 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm8[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm9[0] +; SSE-NEXT: movapd 208(%rdi), %xmm3 +; SSE-NEXT: movapd 96(%rdi), %xmm2 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 192(%rdi), %xmm6 +; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 128(%rdi), %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm10 +; SSE-NEXT: movapd 176(%rdi), %xmm11 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm9 +; SSE-NEXT: movapd 32(%rdi), %xmm7 +; SSE-NEXT: movapd 48(%rdi), %xmm0 +; SSE-NEXT: movapd 224(%rdi), %xmm13 +; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 160(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = 
xmm8[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1170,7 +1170,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd 384(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 400(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1211,85 +1211,85 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 560(%rdi), %xmm13 -; SSE-NEXT: movapd 608(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 608(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 624(%rdi), %xmm14 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm14[0] -; SSE-NEXT: movapd 576(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: movapd 640(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-NEXT: movapd 576(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd 640(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 592(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 592(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 656(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-NEXT: movapd 656(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 672(%rdi), %xmm2 -; SSE-NEXT: movapd 720(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] -; SSE-NEXT: movapd 736(%rdi), %xmm6 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm6[0] -; SSE-NEXT: movapd 688(%rdi), %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: movapd 672(%rdi), %xmm6 +; SSE-NEXT: movapd 720(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: movapd 736(%rdi), %xmm8 +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm8[0] +; SSE-NEXT: movapd 688(%rdi), %xmm10 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] ; SSE-NEXT: movapd 752(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm12[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm12[0] ; SSE-NEXT: movapd 704(%rdi), %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1] -; SSE-NEXT: movapd 768(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 784(%rdi), %xmm4 -; SSE-NEXT: movapd 832(%rdi), %xmm11 -; SSE-NEXT: movapd %xmm11, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] -; SSE-NEXT: movapd 848(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm0[0] -; SSE-NEXT: movapd 800(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 864(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] -; SSE-NEXT: movapd 816(%rdi), %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] -; SSE-NEXT: movapd 880(%rdi), %xmm10 -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm11[0],xmm10[1] -; SSE-NEXT: movapd %xmm8, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 32(%rsi) -; SSE-NEXT: movapd %xmm5, 112(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movapd %xmm2, 96(%rdx) +; SSE-NEXT: movapd 768(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 784(%rdi), %xmm0 +; SSE-NEXT: movapd 832(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd 848(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE-NEXT: movapd 800(%rdi), %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd 864(%rdi), %xmm5 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm5[0] +; SSE-NEXT: movapd 816(%rdi), %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm9[0],xmm5[1] +; SSE-NEXT: movapd 880(%rdi), %xmm11 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1] +; SSE-NEXT: movapd %xmm7, 96(%rsi) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: movapd %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movapd %xmm4, 112(%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps %xmm2, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movapd %xmm13, 80(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movapd %xmm6, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movapd %xmm6, 96(%rcx) -; SSE-NEXT: movapd %xmm0, 112(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movapd %xmm0, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movapd %xmm13, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movapd %xmm8, 96(%rcx) +; SSE-NEXT: movapd %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movapd %xmm14, 80(%rcx) @@ -1301,8 +1301,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm1, 112(%r8) -; SSE-NEXT: movapd %xmm9, 96(%r8) +; SSE-NEXT: movapd %xmm3, 112(%r8) +; SSE-NEXT: movapd %xmm10, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1315,7 +1315,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm3, 112(%r9) +; SSE-NEXT: movapd %xmm5, 112(%r9) ; SSE-NEXT: movapd %xmm12, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) @@ -1330,9 +1330,9 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm7, 112(%rax) +; SSE-NEXT: movapd %xmm9, 112(%rax) ; SSE-NEXT: movapd %xmm15, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -1345,7 +1345,7 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm10, 112(%rax) +; SSE-NEXT: movapd %xmm11, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1423,182 +1423,182 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm15[0],ymm5[2],ymm15[2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[1],ymm11[0],ymm9[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm6[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm10 -; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm2[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm8[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],xmm12[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm11[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm3[0],ymm10[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[0],ymm6[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm5 = ymm2[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[3],ymm15[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[0],ymm7[3],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r8) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 
32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 32(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm10, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 @@ -1626,9 +1626,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm4 @@ -1703,8 +1702,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; 
AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1713,7 +1712,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm0[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] @@ -1726,58 +1726,57 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX2-ONLY-NEXT: 
vpalignr {{.*#+}} xmm3 = xmm4[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm2[1],ymm14[3],ymm2[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1802,7 +1801,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 64(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1810,21 +1810,21 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 96(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm7, 96(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 32(%rax) -; AVX2-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 32(%rax) +; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2221,58 +2221,58 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1448, %rsp # imm = 0x5A8 -; SSE-NEXT: movapd 208(%rdi), %xmm0 -; SSE-NEXT: movapd 96(%rdi), %xmm1 -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 192(%rdi), %xmm3 -; SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 64(%rdi), %xmm8 -; SSE-NEXT: movapd 176(%rdi), %xmm9 -; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd 48(%rdi), %xmm14 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm8[0] -; SSE-NEXT: movapd %xmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm9[0] +; SSE-NEXT: movapd 208(%rdi), %xmm3 +; SSE-NEXT: movapd 96(%rdi), %xmm2 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 192(%rdi), %xmm6 +; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 128(%rdi), %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm10 +; SSE-NEXT: movapd 176(%rdi), %xmm11 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm9 +; SSE-NEXT: movapd 32(%rdi), %xmm7 +; SSE-NEXT: movapd 48(%rdi), %xmm0 +; SSE-NEXT: movapd 224(%rdi), %xmm13 +; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 160(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; 
SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2485,13 +2485,13 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1344(%rdi), %xmm13 +; SSE-NEXT: movapd 1344(%rdi), %xmm14 ; SSE-NEXT: movapd 1392(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1408(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] ; SSE-NEXT: movapd 1360(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2507,30 +2507,30 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1456(%rdi), %xmm9 -; SSE-NEXT: movapd 1504(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm9[0],xmm14[1] -; SSE-NEXT: movapd 1520(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm12[0] +; SSE-NEXT: movapd 1504(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1] +; SSE-NEXT: movapd 1520(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm13[0] ; SSE-NEXT: movapd 1472(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm2[0],xmm12[1] -; SSE-NEXT: movapd 1536(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: movapd 1536(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1488(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1552(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1552(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1568(%rdi), %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1568(%rdi), %xmm6 ; SSE-NEXT: movapd 1616(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] +; 
SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] ; SSE-NEXT: movapd 1632(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm8[0] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm8[0] ; SSE-NEXT: movapd 1584(%rdi), %xmm11 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm11[0],xmm8[1] ; SSE-NEXT: movapd 1648(%rdi), %xmm1 @@ -2543,54 +2543,54 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1680(%rdi), %xmm1 -; SSE-NEXT: movapd 1728(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd 1744(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] -; SSE-NEXT: movapd 1696(%rdi), %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] +; SSE-NEXT: movapd 1680(%rdi), %xmm3 +; SSE-NEXT: movapd 1728(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd 1744(%rdi), %xmm5 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm5[0] +; SSE-NEXT: movapd 1696(%rdi), %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] ; SSE-NEXT: movapd 1760(%rdi), %xmm10 -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm10[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm10[0] ; SSE-NEXT: movapd 1712(%rdi), %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1] ; SSE-NEXT: movapd 1776(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm7, 224(%rsi) +; SSE-NEXT: movapd %xmm4, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movapd %xmm2, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movapd %xmm1, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movapd %xmm14, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 144(%rsi) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm5, 224(%rdx) -; SSE-NEXT: movapd %xmm1, 240(%rdx) -; SSE-NEXT: movapd %xmm13, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm12, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movapd %xmm6, 224(%rdx) +; SSE-NEXT: movapd %xmm3, 240(%rdx) +; SSE-NEXT: movapd %xmm14, 192(%rdx) ; SSE-NEXT: movapd %xmm9, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rdx) @@ -2616,9 +2616,9 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 240(%rcx) +; SSE-NEXT: movapd %xmm5, 240(%rcx) ; SSE-NEXT: movapd %xmm8, 224(%rcx) -; SSE-NEXT: movapd %xmm12, 208(%rcx) +; SSE-NEXT: movapd %xmm13, 208(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2645,7 +2645,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm6, 240(%r8) +; SSE-NEXT: movapd %xmm7, 240(%r8) ; SSE-NEXT: movapd %xmm11, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) @@ -2776,4519 +2776,4577 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1720, %rsp # imm = 0x6B8 +; AVX1-ONLY-NEXT: subq $1736, %rsp # imm = 0x6C8 ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: 
vmovapd 448(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm11[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm10[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovapd 944(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm2[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1344(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1344(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovapd 1392(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm7[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm3[0],xmm0[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[3],ymm9[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2] +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[3],ymm13[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm8[0],ymm3[0],ymm8[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm15[0],ymm10[0],ymm15[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[3],ymm11[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[0],ymm9[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm5[0],ymm12[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd 
{{.*#+}} ymm5 = ymm15[0],ymm13[0],ymm15[3],ymm13[2] ; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm11[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: 
vmovdqa 128(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm11[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1248(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} 
ymm0 = ymm0[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[1],ymm8[0],ymm12[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[1],ymm5[0],ymm11[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[1],ymm1[0],ymm7[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm13[1],ymm1[0],ymm13[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[1],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm15[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[1],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm1[0],ymm10[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 
640(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm2[0],ymm6[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm2[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm12[0],xmm8[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = 
xmm13[0],xmm5[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm11[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm8[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm0[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm3[0],xmm9[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 
= xmm7[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm14[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm9[0],xmm15[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm15[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = 
xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm2[0],ymm4[0],ymm2[3],ymm4[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm15[0],ymm2[0],ymm15[3],ymm2[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm6[0],ymm15[0],ymm6[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm8[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[0],ymm5[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[3],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 
1312(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[0],ymm1[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[3],ymm8[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] ; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; 
AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: 
vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rax) ; 
AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm9, (%rax) -; AVX1-ONLY-NEXT: addq $1720, %rsp # imm = 0x6B8 +; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, (%rax) +; AVX1-ONLY-NEXT: addq $1736, %rsp # imm = 0x6C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 +; AVX2-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 ; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa 
1168(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm10[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr 
{{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 
-; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm8[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm9[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm13[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm14[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} 
xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 576(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpbroadcastq 1024(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} 
ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1024(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, 
%ymm0, %ymm8 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; 
AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = 
xmm13[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps 
%ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 160(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 160(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: 
subq $2216, %rsp # imm = 0x8A8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovaps 1024(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] ; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, 
%zmm26, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 
64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; 
AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $24, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512F-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11] +; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = 
zmm1[4,5,4,5],zmm29[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm13[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[4,5,4,5],zmm9[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; 
AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: 
vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q 
%zmm4, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: movb $-32, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; 
AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 
{%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] 
+; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm9, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm15, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i64_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), 
%zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovaps 1024(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; 
AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] ; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; 
AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm30, 
%zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, 
%zmm28, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; 
AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $24, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11] +; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
64(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm29[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm13[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[4,5,4,5],zmm9[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 
192(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqu 
%ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm15 = 
mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512F-ONLY-FAST-NEXT: movb $-32, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: 
vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; 
AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm15, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $2216, %rsp 
# imm = 0x8A8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovaps 1024(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] ; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; 
AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $24, %al ; AVX512DQ-SLOW-NEXT: 
kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQ-SLOW-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,11,4,11] +; AVX512DQ-SLOW-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm29[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm13[4,5,4,5] -; 
AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[4,5,4,5],zmm9[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512DQ-SLOW-NEXT: movb $-32, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte 
Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 
64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovaps %zmm9, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm15, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i64_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovaps 1024(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512DQ-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] ; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $24, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,11,4,11] +; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm29[4,5,4,5] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[4,5,4,5],zmm9[4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 ; AVX512DQ-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = 
mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 ; AVX512DQ-FAST-NEXT: movb $-32, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm9 = 
ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte 
Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 192(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-FAST-NEXT: 
movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FAST-NEXT: vmovaps %zmm9, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm15, 64(%rax) +; AVX512DQ-FAST-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = 
[11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] ; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] ; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 
%zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm11 = [14,0,0,7,14,0,0,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[4,5,4,5],zmm13[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm30[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: movb $-32, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i64_stride7_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: 
vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] ; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm31, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: 
vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 ; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $24, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11] +; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 
%zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[4,5,4,5],zmm13[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm30[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm5 +; 
AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: movb $-32, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), 
%ymm10 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 
64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded 
Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] ; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm22, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] ; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, 
%zmm3, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $24, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [4,11,4,11] +; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm30[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 ; AVX512DQBW-SLOW-NEXT: movb $-32, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte 
Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = 
ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm26, 192(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512DQBW-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQBW-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: load_i64_stride7_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 ; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm11 ; AVX512DQBW-FAST-NEXT: vmovaps 576(%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] ; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; 
AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] +; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] ; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm2, %zmm4, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $24, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[4,5,4,5],zmm12[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQBW-FAST-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [4,11,4,11] +; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[4,5,4,5],zmm13[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; 
AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm30[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm5 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 ; AVX512DQBW-FAST-NEXT: movb $-32, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm9 = 
mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; 
AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 192(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512DQBW-FAST-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQBW-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <224 x i64>, ptr %in.vec, align 64 @@ -7313,58 +7371,58 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i64_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $3240, %rsp # imm = 0xCA8 -; SSE-NEXT: movapd 208(%rdi), %xmm0 -; SSE-NEXT: movapd 96(%rdi), %xmm1 -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 192(%rdi), %xmm3 -; SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 176(%rdi), %xmm8 -; SSE-NEXT: movapd 64(%rdi), %xmm9 -; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd 48(%rdi), %xmm14 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] +; SSE-NEXT: movapd 208(%rdi), 
%xmm3 +; SSE-NEXT: movapd 96(%rdi), %xmm2 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 192(%rdi), %xmm6 +; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 128(%rdi), %xmm8 +; SSE-NEXT: movapd 176(%rdi), %xmm11 +; SSE-NEXT: movapd 64(%rdi), %xmm10 +; SSE-NEXT: movapd (%rdi), %xmm12 +; SSE-NEXT: movapd 16(%rdi), %xmm9 +; SSE-NEXT: movapd 32(%rdi), %xmm7 +; SSE-NEXT: movapd 48(%rdi), %xmm0 +; SSE-NEXT: movapd 224(%rdi), %xmm13 +; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 160(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] +; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm15, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 272(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: 
movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7870,7 +7928,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2864(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 2816(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7907,16 +7965,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3024(%rdi), %xmm1 +; SSE-NEXT: movapd 3024(%rdi), %xmm2 ; SSE-NEXT: movapd 3072(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: movapd 3088(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd 3088(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 3040(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 3104(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7950,15 +8008,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 3248(%rdi), %xmm9 ; SSE-NEXT: movapd 3296(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] -; SSE-NEXT: movapd 3312(%rdi), %xmm14 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm14[0] +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] +; SSE-NEXT: movapd 3312(%rdi), %xmm15 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm15[0] ; SSE-NEXT: movapd 3264(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] ; SSE-NEXT: movapd 3328(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movapd 3280(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ 
-7967,48 +8025,48 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3360(%rdi), %xmm5 -; SSE-NEXT: movapd 3408(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] +; SSE-NEXT: movapd 3360(%rdi), %xmm6 +; SSE-NEXT: movapd 3408(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] ; SSE-NEXT: movapd 3424(%rdi), %xmm11 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm11[0] -; SSE-NEXT: movapd 3376(%rdi), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: movapd 3440(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm3[0] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3392(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3456(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 3472(%rdi), %xmm2 -; SSE-NEXT: movapd 3520(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm11[0] +; SSE-NEXT: movapd 3376(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] +; SSE-NEXT: movapd 3440(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 3392(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 3456(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 3472(%rdi), %xmm5 +; SSE-NEXT: movapd 3520(%rdi), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: movapd 3536(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm8[0] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm8[0] ; SSE-NEXT: movapd 3488(%rdi), %xmm13 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] ; SSE-NEXT: movapd 3552(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm0[0] -; SSE-NEXT: movapd 3504(%rdi), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd 3504(%rdi), %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm3, %xmm0 -; SSE-NEXT: movapd 3568(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm3[0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movapd 3568(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] -; SSE-NEXT: movapd %xmm1, 496(%rsi) -; SSE-NEXT: movapd %xmm7, 480(%rsi) -; SSE-NEXT: movapd %xmm6, 464(%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm2, 496(%rsi) +; SSE-NEXT: movapd %xmm4, 480(%rsi) +; 
SSE-NEXT: movapd %xmm7, 464(%rsi) ; SSE-NEXT: movapd %xmm10, 448(%rsi) -; SSE-NEXT: movapd %xmm15, 432(%rsi) +; SSE-NEXT: movapd %xmm14, 432(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8017,61 +8075,61 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 384(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 320(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 272(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movapd %xmm2, 496(%rdx) -; SSE-NEXT: movapd %xmm5, 480(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 320(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload +; SSE-NEXT: movaps %xmm0, 288(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movapd %xmm5, 496(%rdx) +; SSE-NEXT: movapd %xmm6, 480(%rdx) ; SSE-NEXT: movapd %xmm9, 464(%rdx) ; SSE-NEXT: movapd %xmm12, 448(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 432(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rdx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rdx) @@ -8125,7 +8183,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movapd %xmm8, 496(%rcx) ; SSE-NEXT: movapd %xmm11, 480(%rcx) -; SSE-NEXT: movapd %xmm14, 464(%rcx) +; SSE-NEXT: movapd %xmm15, 464(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8187,7 +8245,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movapd %xmm13, 496(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%r8) @@ -8377,7 +8435,7 @@ define void 
@load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm3, 496(%rax) +; SSE-NEXT: movapd %xmm1, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8445,743 +8503,749 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $4264, %rsp # imm = 0x10A8 -; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX1-ONLY-NEXT: subq $4232, %rsp # imm = 0x1088 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 2112(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2016(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 2064(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2560(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2464(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2912(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %xmm2 +; 
AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 3360(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2912(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2960(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3360(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 
2336(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 2288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2784(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2688(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 2736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 3136(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[3],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3232(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[0],ymm4[3],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 3136(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd 3184(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[3],ymm13[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm13[0],ymm5[0],ymm13[3],ymm5[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[0],ymm6[3],ymm12[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[3],ymm9[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm4[0],ymm10[3],ymm4[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm15[0],ymm8[0],ymm15[3],ymm8[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm10[0],ymm0[3],ymm10[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm0[0],ymm13[0],ymm0[3],ymm13[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # 
xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[3],ymm11[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3520(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[0],ymm8[3],ymm10[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 3520(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[3],ymm15[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = 
mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm11[0],ymm0[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm6[0],ymm15[0],ymm6[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX1-ONLY-NEXT: vmovapd 3296(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm3[0],ymm0[0],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 3296(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] ; AVX1-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd 2032(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX1-ONLY-NEXT: 
vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 2032(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 3376(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovdqa 2704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 2368(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm9[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm10, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm13[1],ymm14[0],ymm13[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1],ymm12[0],ymm5[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm11[1],ymm13[0],ymm11[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[1],ymm7[0],ymm0[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm2 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[1],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm12[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: 
vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm4[0],ymm1[2],ymm4[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm15[0],ymm0[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3440(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 
16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3440(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm13[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm5 +; 
AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm9[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm14[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps $12, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 2048(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm14[0],xmm12[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm12[0],xmm13[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 2272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm9[0],xmm10[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovapd 2592(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 2496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm14[1] +; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 2816(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2720(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1920(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2592(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2816(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2720(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 3040(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2944(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 3264(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 3168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9190,36 +9254,38 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[3],ymm1[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[3],ymm1[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 
# 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[3],ymm1[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm1 @@ -9228,169 +9294,168 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] ; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[3],ymm13[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[3],ymm15[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1984(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[0],ymm10[0],ymm4[3],ymm10[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1984(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[0],ymm7[3],ymm11[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 2208(%rdi), %ymm12 -; AVX1-ONLY-NEXT: 
vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2432(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm7[0],ymm3[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovapd 2432(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[0],ymm3[3],ymm9[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2656(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2656(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2880(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm15[0],ymm4[0],ymm15[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovapd 2880(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[3],ymm5[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 3104(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = 
mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 3328(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[0],ymm14[3],ymm3[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 3552(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] ; AVX1-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm10[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm10[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm7[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] @@ -9552,7 +9617,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) @@ -9599,15 +9664,15 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovapd %ymm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 448(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 416(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 384(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 320(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 320(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 288(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 256(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 256(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm14, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9618,63 +9683,62 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $4264, %rsp # imm = 0x10A8 +; AVX1-ONLY-NEXT: addq $4232, %rsp # imm = 0x1088 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3976, %rsp # imm = 0xF88 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX2-ONLY-NEXT: subq $3928, %rsp # imm = 0xF58 +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm4 
+; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, 
%ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2064(%rdi), %xmm1 @@ -9682,221 +9746,220 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2912(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2960(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2960(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3360(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 3408(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = 
xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1840(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1840(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2688(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3136(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2688(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3136(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; 
AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -9907,116 +9970,120 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = 
mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm15[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2144(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2592(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 3488(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 3376(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 3264(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 1136(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2816(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 2144(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 2592(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovdqa 2928(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 3488(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovdqa 3376(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 3264(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = 
ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2368(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq 2816(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq 2368(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq 1920(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 1024(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -10040,213 +10107,207 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 
416(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = 
mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = 
mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; 
AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1],mem[2,3] ; AVX2-ONLY-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 2592(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -10255,520 +10316,525 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2720(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 3040(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; 
AVX2-ONLY-NEXT: vmovdqa 2944(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3168(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqa 3488(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3392(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqa 3488(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3392(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm5 -; 
AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = 
mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = 
mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = 
ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 
192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r9) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r9) +; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 448(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 384(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 320(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 288(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 480(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 448(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 416(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 384(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 352(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 320(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 288(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, 256(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $3976, %rsp # imm = 0xF88 +; AVX2-ONLY-NEXT: addq $3928, %rsp # imm = 0xF58 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6536, %rsp # imm = 0x1988 -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm4 -; 
AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: 
vpermt2q %zmm2, %zmm0, %zmm1 @@ -10776,43 +10842,43 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm18, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 @@ -10820,841 +10886,832 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] +; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm8, 
%zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 ; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm30, %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm31 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm28 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm13 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 +; 
AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm5 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), 
%zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm19 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm16, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm5 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: 
vmovdqa64 2112(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm13 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm5 -; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm7 -; AVX512F-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm7 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm29 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm16, %zmm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm26 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512F-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm8 +; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm6 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm31 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm4 +; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm0 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm8, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 -; AVX512F-NEXT: 
vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm5, %zmm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm7 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpermi2q %zmm25, %zmm12, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm31, %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm2 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm4 ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm15, %zmm26 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm20, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; 
AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: 
vpermt2q %zmm14, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm7, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm26, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm23 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm8, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm20 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm24 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm7, %zmm28 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, 
%zmm0, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 -; AVX512F-NEXT: 
vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm22 ; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm12 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, 
%zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <0,7,14,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm19 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm11[4,5,4,5],zmm6[4,5,4,5] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm9, %zmm29 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm13[4,5,4,5],zmm18[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm16[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,7,14,u> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm17 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm13[4,5,4,5],zmm29[4,5,4,5] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm0[4,5,4,5],zmm17[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,4,11] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = 
zmm7[4,5,4,5],zmm21[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm0[4,5,4,5],zmm23[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm17[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm0[4,5,4,5],zmm30[4,5,4,5] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm19 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] -; AVX512F-NEXT: vpermt2q 
%zmm12, %zmm30, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm13, %zmm10, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm30, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm2[4,5,4,5],zmm4[4,5,4,5] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm14, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm22 
+; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm30, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <9,0,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; 
AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm23, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm14, %zmm1, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm15 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm22, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; 
AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm1, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm1, %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm15 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm11, %zmm14, %zmm14 -; AVX512F-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm14 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm13, 384(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%rsi) +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm13 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm15 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm29, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm23, 448(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, 320(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rsi) -; 
AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm0, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm20, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm27, 128(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm2, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%r9) @@ -11694,7 +11751,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) @@ -11702,54 +11759,55 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; 
AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: addq $6536, %rsp # imm = 0x1988 +; AVX512F-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6536, %rsp # imm = 0x1988 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm28 +; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 @@ -11757,688 +11815,664 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 +; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] ; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-NEXT: 
vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm18, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 
1856(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm6 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm10 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm6 -; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm7, %zmm15 -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm15 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm6 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm11 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm9 +; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm1[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm4 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm8 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm8 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; 
AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm7 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm3 +; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm18, %zmm0 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm4 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm21 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm18, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 ; AVX512BW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm19, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm14 +; 
AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, 
%zmm18, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm6, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm6, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm20 +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: 
vpermt2q %zmm3, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q 
%zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512BW-NEXT: movb $24, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,7,14,u> ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm31[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [7,0,9,0,7,0,9,0] 
+; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,11,4,11] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm23[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm9[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm24 -; AVX512BW-NEXT: movb $24, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <0,7,14,u> -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm8[4,5,4,5],zmm26[4,5,4,5] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 
%zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm29 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[4,5,4,5],zmm16[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm13[4,5,4,5],zmm17[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm0[4,5,4,5],zmm15[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm12[4,5,4,5],zmm0[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm0[4,5,4,5],zmm21[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <9,0,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm5[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512BW-NEXT: vpermt2q 
%zmm4, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-NEXT: vpermi2q %zmm0, %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -12448,13 +12482,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -12475,170 +12503,179 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
+; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k2}
 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2}
 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2}
+; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm1
+; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm21, %zmm17
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2}
+; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm1
+; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2}
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2}
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2}
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm18
+; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k2}
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm18
 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm18
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm23, %zmm18
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
-; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm19
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm19, %zmm18
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k2}
+; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm19
 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23]
 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm19, %xmm19
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm19
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm20, %zmm19
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k2}
+; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm20
 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23]
 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm0, %zmm20
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
-; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm21
-; AVX512BW-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm21
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm21
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
-; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm22
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm21, %zmm20
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k2}
+; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm22
 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23]
 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm22
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2}
-; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm23
-; AVX512BW-NEXT: vpalignr {{.*#+}} ymm23 = mem[8,9,10,11,12,13,14,15],ymm23[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm23[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm23, %xmm23
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm23, %zmm0, %zmm23
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm21, %zmm22
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %ymm24
 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm0, %zmm24
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm25
-; AVX512BW-NEXT: vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23]
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm25, %xmm25
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm6, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2}
-; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm12, 384(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rdx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm2, 448(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm2, 256(%rcx)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-NEXT: vmovaps %zmm2, 320(%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rcx)
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm21, %zmm21
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k2}
+; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm24
+; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
+; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm26, %zmm24
+; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 {%k2}
+; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rsi)
+; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT: vmovaps %zmm10, 64(%rsi)
+; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-NEXT: vmovaps %zmm10, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rcx)
+; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rcx)
+; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rcx)
+; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rcx)
+; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx)
+; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rcx)
+; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm18, 384(%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%r8)
 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9)
 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -12675,7 +12712,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
@@ -12687,9 +12724,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
-; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT: addq $6536, %rsp # imm = 0x1988
+; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 %wide.vec = load <448 x i64>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index b84fff274cd71..7d29ed1192bd4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -167,69 +167,69 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
 ; SSE-LABEL: load_i64_stride8_vf4:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps 112(%rdi), %xmm5
+; SSE-NEXT: movaps 112(%rdi), %xmm6
 ; SSE-NEXT: movaps 240(%rdi), %xmm0
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 96(%rdi), %xmm8
-; SSE-NEXT: movaps 224(%rdi), %xmm10
+; SSE-NEXT: movaps 96(%rdi), %xmm9
+; SSE-NEXT: movaps 224(%rdi), %xmm11
 ; SSE-NEXT: movaps 160(%rdi), %xmm0
-; SSE-NEXT: movaps 80(%rdi), %xmm12
-; SSE-NEXT: movaps 208(%rdi), %xmm13
+; SSE-NEXT: movaps 80(%rdi), %xmm14
+; SSE-NEXT: movaps 208(%rdi), %xmm15
 ; SSE-NEXT: movaps 144(%rdi), %xmm2
-; SSE-NEXT: movaps 64(%rdi), %xmm14
+; SSE-NEXT: movaps 64(%rdi), %xmm12
 ; SSE-NEXT: movaps (%rdi), %xmm7
-; SSE-NEXT: movaps 16(%rdi), %xmm6
-; SSE-NEXT: movaps 32(%rdi), %xmm4
-; SSE-NEXT: movaps 48(%rdi), %xmm3
-; SSE-NEXT: movaps 192(%rdi), %xmm15
-; SSE-NEXT: movaps 128(%rdi), %xmm9
-; SSE-NEXT: movaps %xmm9, %xmm11
-; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1]
-; SSE-NEXT: movaps %xmm7, %xmm15
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1]
-; SSE-NEXT: movaps %xmm2, %xmm14
-; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1]
-; SSE-NEXT: movaps %xmm6, %xmm13
+; SSE-NEXT: movaps 16(%rdi), %xmm5
+; SSE-NEXT: movaps 32(%rdi), %xmm3
+; SSE-NEXT: movaps 48(%rdi), %xmm4
+; SSE-NEXT: movaps 192(%rdi), %xmm13
+; SSE-NEXT: movaps 128(%rdi), %xmm8
+; SSE-NEXT: movaps %xmm8, %xmm10
+; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1]
+; SSE-NEXT: movaps %xmm7, %xmm13
 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1]
-; SSE-NEXT: movaps %xmm0, %xmm12
-; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1]
+; SSE-NEXT: movaps %xmm2, %xmm12
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1]
+; SSE-NEXT: movaps %xmm5, %xmm15
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1]
+; SSE-NEXT: movaps %xmm0, %xmm14
+; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm4, %xmm10
-; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1]
-; SSE-NEXT: movaps %xmm3, %xmm8
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
-; SSE-NEXT: movaps 176(%rdi), %xmm5
-; SSE-NEXT: movaps %xmm5, %xmm1
+; SSE-NEXT: movaps %xmm3, %xmm11
+; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1]
+; SSE-NEXT: movaps %xmm4, %xmm9
+; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm6[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
+; SSE-NEXT: movaps 176(%rdi), %xmm6
+; SSE-NEXT: movaps %xmm6, %xmm1
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
-; SSE-NEXT: movaps %xmm15, (%rsi)
-; SSE-NEXT: movaps %xmm11, 16(%rsi)
+; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
+; SSE-NEXT: movaps %xmm13, (%rsi)
+; SSE-NEXT: movaps %xmm10, 16(%rsi)
 ; SSE-NEXT: movaps %xmm7, (%rdx)
-; SSE-NEXT: movaps %xmm9, 16(%rdx)
-; SSE-NEXT: movaps %xmm13, (%rcx)
-; SSE-NEXT: movaps %xmm14, 16(%rcx)
-; SSE-NEXT: movaps %xmm6, (%r8)
+; SSE-NEXT: movaps %xmm8, 16(%rdx)
+; SSE-NEXT: movaps %xmm15, (%rcx)
+; SSE-NEXT: movaps %xmm12, 16(%rcx)
+; SSE-NEXT: movaps %xmm5, (%r8)
 ; SSE-NEXT: movaps %xmm2, 16(%r8)
-; SSE-NEXT: movaps %xmm10, (%r9)
-; SSE-NEXT: movaps %xmm12, 16(%r9)
+; SSE-NEXT: movaps %xmm11, (%r9)
+; SSE-NEXT: movaps %xmm14, 16(%r9)
 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm4, (%rax)
+; SSE-NEXT: movaps %xmm3, (%rax)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 16(%rax)
 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm8, (%rax)
+; SSE-NEXT: movaps %xmm9, (%rax)
 ; SSE-NEXT: movaps %xmm1, 16(%rax)
 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm5, 16(%rax)
-; SSE-NEXT: movaps %xmm3, (%rax)
+; SSE-NEXT: movaps %xmm6, 16(%rax)
+; SSE-NEXT: movaps %xmm4, (%rax)
 ; SSE-NEXT: retq
 ;
 ; AVX1-ONLY-LABEL: load_i64_stride8_vf4:
@@ -419,112 +419,112 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-LABEL: load_i64_stride8_vf8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: subq $152, %rsp
-; SSE-NEXT: movaps 336(%rdi), %xmm11
-; SSE-NEXT: movaps 464(%rdi), %xmm6
-; SSE-NEXT: movaps 400(%rdi), %xmm7
-; SSE-NEXT: movaps 80(%rdi), %xmm0
-; SSE-NEXT: movaps 208(%rdi), %xmm1
-; SSE-NEXT: movaps 144(%rdi), %xmm8
-; SSE-NEXT: movaps 320(%rdi), %xmm2
-; SSE-NEXT: movaps 256(%rdi), %xmm10
-; SSE-NEXT: movaps 448(%rdi), %xmm3
+; SSE-NEXT: movaps 336(%rdi), %xmm0
+; SSE-NEXT: movaps 464(%rdi), %xmm1
+; SSE-NEXT: movaps 400(%rdi), %xmm8
+; SSE-NEXT: movaps 80(%rdi), %xmm2
+; SSE-NEXT: movaps 208(%rdi), %xmm3
+; SSE-NEXT: movaps 144(%rdi), %xmm9
+; SSE-NEXT: movaps 320(%rdi), %xmm4
+; SSE-NEXT: movaps 256(%rdi), %xmm11
+; SSE-NEXT: movaps 448(%rdi), %xmm5
 ; SSE-NEXT: movaps 384(%rdi), %xmm12
-; SSE-NEXT: movaps 64(%rdi), %xmm4
+; SSE-NEXT: movaps 64(%rdi), %xmm6
 ; SSE-NEXT: movaps (%rdi), %xmm13
-; SSE-NEXT: movaps 16(%rdi), %xmm9
-; SSE-NEXT: movaps 192(%rdi), %xmm5
+; SSE-NEXT: movaps 16(%rdi), %xmm10
+; SSE-NEXT: movaps 192(%rdi), %xmm7
 ; SSE-NEXT: movaps 128(%rdi), %xmm14
 ; SSE-NEXT: movaps %xmm14, %xmm15
-; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm5[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0]
 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1]
 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm13, %xmm5
+; SSE-NEXT: movaps %xmm13, %xmm7
+; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1]
+; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm12, %xmm6
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1]
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm11, %xmm5
 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0]
 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1]
-; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, %xmm4
+; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1]
+; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps %xmm9, %xmm4
 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1]
+; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm10, %xmm3
 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1]
 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm8, %xmm3
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm8, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm9, %xmm1
+; SSE-NEXT: movaps 272(%rdi), %xmm15
+; SSE-NEXT: movaps %xmm15, %xmm1
 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
-; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm7, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1]
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 272(%rdi), %xmm15
-; SSE-NEXT: movaps %xmm15, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm11[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
 ; SSE-NEXT: movaps 96(%rdi), %xmm0
-; SSE-NEXT: movaps 32(%rdi), %xmm12
-; SSE-NEXT: movaps %xmm12, %xmm1
+; SSE-NEXT: movaps 32(%rdi), %xmm14
+; SSE-NEXT: movaps %xmm14, %xmm1
 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1]
 ; SSE-NEXT: movaps 224(%rdi), %xmm0
-; SSE-NEXT: movaps 160(%rdi), %xmm13
-; SSE-NEXT: movaps %xmm13, %xmm1
+; SSE-NEXT: movaps 160(%rdi), %xmm9
+; SSE-NEXT: movaps %xmm9, %xmm1
 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
 ; SSE-NEXT: movaps 352(%rdi), %xmm0
-; SSE-NEXT: movaps 288(%rdi), %xmm9
-; SSE-NEXT: movaps %xmm9, %xmm11
+; SSE-NEXT: movaps 288(%rdi), %xmm10
+; SSE-NEXT: movaps %xmm10, %xmm12
+; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
+; SSE-NEXT: movaps 480(%rdi), %xmm0
+; SSE-NEXT: movaps 416(%rdi), %xmm8
+; SSE-NEXT: movaps %xmm8, %xmm11
 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1]
-; SSE-NEXT: movaps 480(%rdi), %xmm1
-; SSE-NEXT: movaps 416(%rdi), %xmm4
-; SSE-NEXT: movaps %xmm4, %xmm8
-; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSE-NEXT: movaps 112(%rdi), %xmm1
-; SSE-NEXT: movaps 48(%rdi), %xmm10
-; SSE-NEXT: movaps %xmm10, %xmm14
-; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm1[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
+; SSE-NEXT: movaps 112(%rdi), %xmm0
+; SSE-NEXT: movaps 48(%rdi), %xmm7
+; SSE-NEXT: movaps %xmm7, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1]
 ; SSE-NEXT: movaps 240(%rdi), %xmm0
-; SSE-NEXT: movaps 176(%rdi), %xmm6
-; SSE-NEXT: movaps %xmm6, %xmm7
-; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
+; SSE-NEXT: movaps 176(%rdi), %xmm5
+; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
 ; SSE-NEXT: movaps 368(%rdi), %xmm0
-; SSE-NEXT: movaps 304(%rdi), %xmm3
-; SSE-NEXT: movaps %xmm3, %xmm5
-; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; SSE-NEXT: movaps 304(%rdi), %xmm2
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 ; SSE-NEXT: movaps 496(%rdi), %xmm0
 ; SSE-NEXT: movaps 432(%rdi), %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm2
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, %xmm3
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
-; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, (%rsi)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
@@ -547,27 +547,27 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movaps %xmm0, (%r8)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 16(%r8)
-; SSE-NEXT: movaps %xmm11, 32(%r9)
-; SSE-NEXT: movaps %xmm8, 48(%r9)
+; SSE-NEXT: movaps %xmm12, 32(%r9)
+; SSE-NEXT: movaps %xmm11, 48(%r9)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, (%r9)
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm0, 16(%r9)
 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm4, 48(%rax)
-; SSE-NEXT: movaps %xmm9, 32(%rax)
-; SSE-NEXT: movaps %xmm13, 16(%rax)
-; SSE-NEXT: movaps %xmm12, (%rax)
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm2, 48(%rax)
-; SSE-NEXT: movaps %xmm5, 32(%rax)
-; SSE-NEXT: movaps %xmm7, 16(%rax)
+; SSE-NEXT: movaps %xmm8, 48(%rax)
+; SSE-NEXT: movaps %xmm10, 32(%rax)
+; SSE-NEXT: movaps %xmm9, 16(%rax)
 ; SSE-NEXT: movaps %xmm14, (%rax)
 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps %xmm1, 48(%rax)
-; SSE-NEXT: movaps %xmm3, 32(%rax)
+; SSE-NEXT: movaps %xmm3, 48(%rax)
+; SSE-NEXT: movaps %xmm4, 32(%rax)
 ; SSE-NEXT: movaps %xmm6, 16(%rax)
-; SSE-NEXT: movaps %xmm10, (%rax)
+; SSE-NEXT: movaps %xmm13, (%rax)
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movaps %xmm1, 48(%rax)
+; SSE-NEXT: movaps %xmm2, 32(%rax)
+; SSE-NEXT: movaps %xmm5, 16(%rax)
+; SSE-NEXT: movaps %xmm7, (%rax)
 ; SSE-NEXT: addq $152, %rsp
 ; SSE-NEXT: retq
 ;
@@ -602,17 +602,17 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1]
 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
 ; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm10
 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11
-; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm11[0],xmm10[0]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm12
 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13
-; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm12[0]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1]
@@ -630,79 +630,79 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0
-; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm2[0]
+; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm2[0]
 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1
-; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2
-; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm2[0],xmm1[0]
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm2[1],xmm1[1]
+; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0
+; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1
+; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm1[0],xmm0[0]
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1]
 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1
 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2
 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm1[0]
 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm1[1]
-; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm5
-; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm5[0],ymm2[2],ymm5[2]
+; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6
+; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2]
 ; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm4
-; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1
-; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm4[0]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm3
+; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm4[0]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm0[4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9
-; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6
-; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2]
-; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm3
+; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2
+; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm9[0],ymm2[2],ymm9[2]
+; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1
 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0
-; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm3[0]
+; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3]
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm9[1],ymm6[3],ymm9[3]
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3]
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3]
+; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rdx)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rdx)
-; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rdx)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8)
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx)
+; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8)
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8)
 ; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm10, 16(%r9)
-; AVX1-ONLY-NEXT: vmovaps %xmm13, 32(%r9)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%r9)
+; AVX1-ONLY-NEXT: vmovaps %xmm15, 32(%r9)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9)
 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX1-ONLY-NEXT: vmovaps %xmm11, (%rax)
-; AVX1-ONLY-NEXT: vmovaps %xmm15, 16(%rax)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rax)
-; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rax)
+; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%rax)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax)
+; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax)
 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rax)
-; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax)
+; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rax)
 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
-; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax)
+; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax)
 ; AVX1-ONLY-NEXT: addq $184, %rsp
 ; AVX1-ONLY-NEXT: vzeroupper
 ; AVX1-ONLY-NEXT: retq
@@ -713,13 +713,13 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8
-; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9
-; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10
-; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11
+; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm10
+; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm11
+; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm13
 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7
-; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm12
-; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm13
-; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm14
+; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm14
+; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12
+; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm15
 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1
 ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2
 ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1
@@ -736,52 +736,52 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3]
 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm8[0],ymm9[2],ymm8[2]
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm6[2,3]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm15
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
-; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm14
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm7[1],ymm12[3],ymm7[3]
-; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm13[2,3]
-; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3]
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3]
-; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm9
-; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm9, %ymm9
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
+; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3]
+; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3]
+; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm10
+; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11
 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10
-; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm10
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2]
-; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm13
-; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm13, %ymm13
-; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm12, %ymm12
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm13[0],ymm12[2],ymm13[2]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
-; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm13
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2]
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2]
+; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2]
+; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm15
+; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15
+; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3]
+; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm15
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
+; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2]
 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4
-; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1
 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0
 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2]
 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm13[1],ymm14[1],ymm13[3],ymm14[3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm2[1],ymm15[3],ymm2[3]
-; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm6[2,3]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm12[1],ymm15[3],ymm12[3]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm3[1],ymm9[3],ymm3[3]
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],ymm6[2,3]
 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3]
 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
 ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi)
@@ -798,11 +798,11 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT: vmovaps %ymm8, (%r8)
 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8)
-; AVX2-ONLY-NEXT: vmovaps %ymm10, (%r9)
-; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r9)
+; AVX2-ONLY-NEXT: vmovaps %ymm13, (%r9)
+; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%r9)
 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rax)
-; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%rax)
+; AVX2-ONLY-NEXT: vmovaps %ymm14, (%rax)
+; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rax)
 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax)
 ; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax)
@@ -815,229 +815,229 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512F-LABEL: load_i64_stride8_vf8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm0
+; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18
 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1
 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4
 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm8
+; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12
 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3
-; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm6
-; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm9
+; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm7
+; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11
 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
 ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm18
-; AVX512F-NEXT: vpermi2q %zmm8, %zmm3, %zmm5
+; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6
+; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm6
+; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm5
 ; AVX512F-NEXT: movb $-64, %al
 ; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9]
-; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm11
-; AVX512F-NEXT: vpermi2q %zmm8, %zmm3, %zmm10
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm13
-; AVX512F-NEXT: vpermi2q %zmm8, %zmm3, %zmm12
-; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1}
+; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9]
+; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10
+; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm10
+; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm8
+; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10]
+; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm13
+; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm9
+; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1}
 ; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm13
 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm14
 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm16
 ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm15[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm12
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm15
-; AVX512F-NEXT: vpermi2q %zmm8, %zmm3, %zmm7
-; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1}
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3]
+; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm9
+; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15
+; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm15
+; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
+; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm13
 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
 ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm3[0],zmm8[0],zmm3[2],zmm8[2],zmm3[4],zmm8[4],zmm3[6],zmm8[6]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm8[1],zmm3[3],zmm8[3],zmm3[5],zmm8[5],zmm3[7],zmm8[7]
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6]
+; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7]
 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
 ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm15, %zmm17
+; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm17
 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15]
 ; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm8, %zmm19, %zmm3
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm9[0],zmm6[0],zmm9[2],zmm6[2],zmm9[4],zmm6[4],zmm9[6],zmm6[6]
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm9[1],zmm6[1],zmm9[3],zmm6[3],zmm9[5],zmm6[5],zmm9[7],zmm6[7]
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1}
+; AVX512F-NEXT: vpermt2q %zmm12, %zmm19, %zmm3
+; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6]
+; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7]
+; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm11
+; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
 ; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
-; AVX512F-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm9
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13]
-; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm8
-; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm9
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13]
-; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm14
-; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm14, %ymm14
-; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm9[0],ymm14[0],ymm9[2],ymm14[2]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm5, %zmm5
-; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm11
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
+; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13]
+; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
+; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm11
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
+; AVX512F-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm7
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
+; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1}
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm6
+; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm11
+; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm11, %ymm11
+; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2]
+; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm5, %zmm5
+; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
+; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3]
 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1}
-; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6
+; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
+; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7
 ; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm15
 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
 ; AVX512F-NEXT: # ymm8 = mem[0,1,0,1]
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8
+; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm8
 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm17, %zmm8
 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm2
 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm4
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512F-NEXT: vpermi2q %zmm18, %zmm1, %zmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512F-NEXT: vmovdqa64 %zmm5, (%rsi)
-; AVX512F-NEXT: vmovdqa64 %zmm9, (%rdx)
-; AVX512F-NEXT: vmovdqa64 %zmm12, (%rcx)
+; AVX512F-NEXT: vmovdqa64 %zmm6, (%rdx)
+; AVX512F-NEXT: vmovdqa64 %zmm9, (%rcx)
 ; AVX512F-NEXT: vmovdqa64 %zmm13, (%r8)
-; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9)
-; AVX512F-NEXT: vmovdqa64 %zmm6, (%r10)
+; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9)
+; AVX512F-NEXT: vmovdqa64 %zmm7, (%r10)
 ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdi)
-; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: load_i64_stride8_vf8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm18
 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm8
+; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12
 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm9
+; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7
+; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm11
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8]
 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm18
-; AVX512BW-NEXT: vpermi2q %zmm8, %zmm3, %zmm5
+; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6
+; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6
+; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm5
 ; AVX512BW-NEXT: movb $-64, %al
 ; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm11
-; AVX512BW-NEXT: vpermi2q %zmm8, %zmm3, %zmm10
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm13
-; AVX512BW-NEXT: vpermi2q %zmm8, %zmm3, %zmm12
-; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1}
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm10
+; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm8
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13
+; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm13
+; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm9
+; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1}
 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13
 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14
 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm13[0],ymm14[2],ymm13[2]
 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm16
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm15[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm12
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm15
-; AVX512BW-NEXT: vpermi2q %zmm8, %zmm3, %zmm7
-; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1}
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
+; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm15[2,3]
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm9
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15
+; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm15
+; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1}
 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3]
 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm13
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm13
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12]
 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm3[0],zmm8[0],zmm3[2],zmm8[2],zmm3[4],zmm8[4],zmm3[6],zmm8[6]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm8[1],zmm3[3],zmm8[3],zmm3[5],zmm8[5],zmm3[7],zmm8[7]
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm3[0],zmm12[0],zmm3[2],zmm12[2],zmm3[4],zmm12[4],zmm3[6],zmm12[6]
+; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm3[1],zmm12[1],zmm3[3],zmm12[3],zmm3[5],zmm12[5],zmm3[7],zmm12[7]
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14]
 ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm15, %zmm17
+; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm17
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [7,15,7,15,7,15,7,15]
 ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm9[0],zmm6[0],zmm9[2],zmm6[2],zmm9[4],zmm6[4],zmm9[6],zmm6[6]
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm9[1],zmm6[1],zmm9[3],zmm6[3],zmm9[5],zmm6[5],zmm9[7],zmm6[7]
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1}
+; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm3
+; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm11[0],zmm7[0],zmm11[2],zmm7[2],zmm11[4],zmm7[4],zmm11[6],zmm7[6]
+; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm11[1],zmm7[1],zmm11[3],zmm7[3],zmm11[5],zmm7[5],zmm11[7],zmm7[7]
+; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11
+; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm14
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12]
-; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm8
-; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm9
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13]
-; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm14
-; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm14, %ymm14
-; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm9, %ymm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm9[0],ymm14[0],ymm9[2],ymm14[2]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12]
+; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm11
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12
+; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm11
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13]
+; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm7
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1}
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm6
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm11
+; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm11, %ymm11
+; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm6[0],ymm11[0],ymm6[2],ymm11[2]
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm5, %zmm5
+; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
+; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3]
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1}
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7
 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm15
 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14]
 ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8
+; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm8
 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm17, %zmm8
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm2
 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15]
 ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx)
+; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx)
 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9)
-; AVX512BW-NEXT: vmovdqa64 %zmm6, (%r10)
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9)
+; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10)
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdi)
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 %wide.vec = load <64 x i64>, ptr %in.vec, align 64
@@ -1205,29 +1205,31 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 608(%rdi), %xmm0
-; SSE-NEXT: movaps 544(%rdi), %xmm15
-; SSE-NEXT: movaps %xmm15, %xmm1
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps 544(%rdi), %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1]
 ; SSE-NEXT: movaps 736(%rdi), %xmm0
-; SSE-NEXT: movaps 672(%rdi), %xmm12
+; SSE-NEXT: movaps 672(%rdi), %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 864(%rdi), %xmm0
+; SSE-NEXT: movaps 800(%rdi), %xmm12
 ; SSE-NEXT: movaps %xmm12, %xmm1
 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
-; SSE-NEXT: movaps 864(%rdi), %xmm0
-; SSE-NEXT: movaps 800(%rdi), %xmm9
+; SSE-NEXT: movaps 992(%rdi), %xmm0
+; SSE-NEXT: movaps 928(%rdi), %xmm9
 ; SSE-NEXT: movaps %xmm9, %xmm1
 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 992(%rdi), %xmm0 -; SSE-NEXT: movaps 928(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1236,43 +1238,41 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 176(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 304(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm0 -; SSE-NEXT: movaps 432(%rdi), %xmm14 +; SSE-NEXT: movaps 176(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 304(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdi), %xmm0 +; SSE-NEXT: movaps 432(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 624(%rdi), %xmm0 ; SSE-NEXT: movaps 560(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 752(%rdi), %xmm0 -; SSE-NEXT: movaps 688(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 688(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 816(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps 1008(%rdi), %xmm0 -; SSE-NEXT: movaps 944(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 944(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload @@ -1354,10 +1354,12 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, 112(%rax) -; SSE-NEXT: movaps %xmm9, 96(%rax) -; SSE-NEXT: movaps %xmm12, 80(%rax) -; SSE-NEXT: movaps %xmm15, 64(%rax) +; SSE-NEXT: movaps %xmm9, 112(%rax) +; SSE-NEXT: movaps %xmm12, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1367,26 +1369,24 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 112(%rax) -; SSE-NEXT: movaps %xmm6, 96(%rax) -; SSE-NEXT: movaps %xmm8, 80(%rax) -; SSE-NEXT: movaps %xmm11, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps %xmm3, 112(%rax) +; SSE-NEXT: movaps %xmm4, 96(%rax) +; SSE-NEXT: movaps %xmm7, 80(%rax) +; SSE-NEXT: movaps %xmm8, 64(%rax) +; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm15, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 112(%rax) +; SSE-NEXT: movaps %xmm2, 112(%rax) ; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm2, 80(%rax) +; SSE-NEXT: movaps %xmm6, 80(%rax) ; SSE-NEXT: movaps %xmm5, 64(%rax) -; SSE-NEXT: movaps %xmm14, 48(%rax) -; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps %xmm13, 16(%rax) +; SSE-NEXT: movaps %xmm10, 48(%rax) +; SSE-NEXT: movaps %xmm13, 32(%rax) +; SSE-NEXT: movaps %xmm14, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 @@ -1394,7 +1394,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $792, %rsp # imm = 0x318 +; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -1524,13 +1524,13 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -1543,239 +1543,239 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm6[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm9[0],ymm4[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), 
%ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm9[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm15, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 
96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) -; AVX1-ONLY-NEXT: addq $792, %rsp # imm = 0x318 -; AVX1-ONLY-NEXT: vzeroupper -; AVX1-ONLY-NEXT: retq -; -; AVX2-ONLY-LABEL: load_i64_stride8_vf16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $808, %rsp # imm = 0x328 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 +; AVX1-ONLY-NEXT: vzeroupper +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i64_stride8_vf16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: subq $808, %rsp # imm = 0x328 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 
256(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm9[1],ymm3[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm10[1],ymm11[1],ymm10[3],ymm11[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm12 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 @@ -1812,50 +1812,50 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm5[1],ymm8[1],ymm5[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = 
ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) @@ -1907,7 +1907,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1918,72 +1918,72 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rax) ; AVX2-ONLY-NEXT: addq $808, %rsp # imm = 0x328 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride8_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $200, %rsp -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm28 +; AVX512F-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 ; AVX512F-NEXT: vmovaps 640(%rdi), %zmm0 ; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm21 -; AVX512F-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm24 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm31, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm31, %zmm15 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm19 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm16 -; AVX512F-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm19 -; AVX512F-NEXT: vinserti32x4 $1, 128(%rdi), %ymm7, %ymm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm31, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm29, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm31 {%k1} -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21 +; AVX512F-NEXT: vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2] ; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm29, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm2 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -1991,54 +1991,54 @@ define void 
@load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm20[1],ymm0[1],ymm20[3],ymm0[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3] ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm24 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm29, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm22 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm29 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm30, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm5[0],zmm13[0],zmm5[2],zmm13[2],zmm5[4],zmm13[4],zmm5[6],zmm13[6] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 ; 
AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm6 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm6 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm29[0],zmm10[0],zmm29[2],zmm10[2],zmm29[4],zmm10[4],zmm29[6],zmm10[6] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm28, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm31, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm18, %zmm8, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2046,163 +2046,166 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] ; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm5[1],zmm13[1],zmm5[3],zmm13[3],zmm5[5],zmm13[5],zmm5[7],zmm13[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm14[1],zmm12[1],zmm14[3],zmm12[3],zmm14[5],zmm12[5],zmm14[7],zmm12[7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm23, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm29, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm10[1],zmm29[3],zmm10[3],zmm29[5],zmm10[5],zmm29[7],zmm10[7] -; AVX512F-NEXT: vpermt2q 
%zmm10, %zmm8, %zmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm11[1],zmm9[1],zmm11[3],zmm9[3],zmm11[5],zmm9[5],zmm11[7],zmm9[7] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm30 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm24 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm23, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm28, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm28 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm21, %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm28, %zmm0, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm10, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm8, %zmm7 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm16 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm12 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm15 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm21, %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15] ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-NEXT: 
vpermt2q %zmm30, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm11 ; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm11, %ymm11 -; AVX512F-NEXT: vpermi2q %zmm28, %zmm0, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm10, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm10 ; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm31, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm24 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm24, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm29 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm16[1],ymm21[1],ymm16[3],ymm21[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm29, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm25, (%r9) ; AVX512F-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-NEXT: addq $200, %rsp +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $200, %rsp -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm28 +; AVX512BW-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 ; AVX512BW-NEXT: vmovaps 640(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm21 -; AVX512BW-NEXT: vmovaps 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm31, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm15 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm19 -; AVX512BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm7, %ymm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm18[0],ymm19[0],ymm18[2],ymm19[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm1 -; 
AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm31, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm29, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 {%k1} -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm16, %ymm21 +; AVX512BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm0, %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm21[0],ymm16[2],ymm21[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm20[0],ymm5[0],ymm20[2],ymm5[2] ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm23 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm2 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -2210,54 +2213,54 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm27[0],ymm25[0],ymm27[2],ymm25[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; 
AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm20[1],ymm0[1],ymm20[3],ymm0[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm20[1],ymm5[1],ymm20[3],ymm5[3] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm24 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm22 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm7[2,3],ymm5[2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm29 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm30, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm25[1],ymm27[3],ymm25[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm5[0],zmm13[0],zmm5[2],zmm13[2],zmm5[4],zmm13[4],zmm5[6],zmm13[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm24[0],zmm14[0],zmm24[2],zmm14[2],zmm24[4],zmm14[4],zmm24[6],zmm14[6] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm6 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = 
zmm29[0],zmm10[0],zmm29[2],zmm10[2],zmm29[4],zmm10[4],zmm29[6],zmm10[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm28[0],zmm11[0],zmm28[2],zmm11[2],zmm28[4],zmm11[4],zmm28[6],zmm11[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm8, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] @@ -2265,99 +2268,102 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm5[1],zmm13[1],zmm5[3],zmm13[3],zmm5[5],zmm13[5],zmm5[7],zmm13[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm14[1],zmm12[1],zmm14[3],zmm12[3],zmm14[5],zmm12[5],zmm14[7],zmm12[7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm29, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm29[1],zmm10[1],zmm29[3],zmm10[3],zmm29[5],zmm10[5],zmm29[7],zmm10[7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k1} = zmm11[0],zmm9[0],zmm11[2],zmm9[2],zmm11[4],zmm9[4],zmm11[6],zmm9[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm29 {%k1} = zmm11[1],zmm9[1],zmm11[3],zmm9[3],zmm11[5],zmm9[5],zmm11[7],zmm9[7] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm30 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = 
zmm24[1],zmm14[1],zmm24[3],zmm14[3],zmm24[5],zmm14[5],zmm24[7],zmm14[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm13[0],zmm12[0],zmm13[2],zmm12[2],zmm13[4],zmm12[4],zmm13[6],zmm12[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm13[1],zmm12[1],zmm13[3],zmm12[3],zmm13[5],zmm12[5],zmm13[7],zmm12[7] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm23, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm28, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm28[1],zmm11[1],zmm28[3],zmm11[3],zmm28[5],zmm11[5],zmm28[7],zmm11[7] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm8, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm16 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm15 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm2 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [7,15,7,15] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm11 ; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: 
vpunpcklqdq {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm24 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm19[1],ymm18[3],ymm19[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm24, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm12, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm16[1],ymm21[1],ymm16[3],ymm21[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm29, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; 
AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: addq $200, %rsp +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 @@ -2385,17 +2391,17 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE: # %bb.0: ; SSE-NEXT: subq $1688, %rsp # imm = 0x698 ; SSE-NEXT: movaps 832(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm1 +; SSE-NEXT: movaps 320(%rdi), %xmm2 ; SSE-NEXT: movaps 256(%rdi), %xmm8 -; SSE-NEXT: movaps 960(%rdi), %xmm2 +; SSE-NEXT: movaps 960(%rdi), %xmm1 ; SSE-NEXT: movaps 896(%rdi), %xmm10 -; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps 448(%rdi), %xmm4 ; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 576(%rdi), %xmm4 +; SSE-NEXT: movaps 576(%rdi), %xmm3 ; SSE-NEXT: movaps 512(%rdi), %xmm12 -; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps 64(%rdi), %xmm6 ; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 704(%rdi), %xmm6 +; SSE-NEXT: movaps 704(%rdi), %xmm5 ; SSE-NEXT: movaps 640(%rdi), %xmm14 ; SSE-NEXT: movaps 192(%rdi), %xmm7 ; SSE-NEXT: movaps 128(%rdi), %xmm13 @@ -2405,34 +2411,34 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] +; SSE-NEXT: movaps %xmm9, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = 
xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 768(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2703,7 +2709,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 1696(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1888(%rdi), %xmm0 @@ -2714,13 +2720,14 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2016(%rdi), %xmm0 -; SSE-NEXT: movaps 1952(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps 1952(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm0 +; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2760,7 +2767,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2769,54 +2776,53 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm0 -; SSE-NEXT: movaps 944(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 944(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 1136(%rdi), 
%xmm0 -; SSE-NEXT: movaps 1072(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 1072(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 1264(%rdi), %xmm0 -; SSE-NEXT: movaps 1200(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps 1200(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 1392(%rdi), %xmm0 -; SSE-NEXT: movaps 1328(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 1328(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 1520(%rdi), %xmm0 -; SSE-NEXT: movaps 1456(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 1648(%rdi), %xmm0 -; SSE-NEXT: movaps 1584(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps 1456(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 1776(%rdi), %xmm0 -; SSE-NEXT: movaps 1712(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: movaps 1648(%rdi), %xmm0 +; SSE-NEXT: movaps 1584(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 1776(%rdi), %xmm0 +; SSE-NEXT: movaps 1712(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps 1904(%rdi), %xmm0 ; SSE-NEXT: movaps 1840(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps 2032(%rdi), %xmm0 -; SSE-NEXT: movaps 1968(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 1968(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2949,7 +2955,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm0, 240(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps (%rsp), 
%xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r9) @@ -2978,7 +2984,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm14, 240(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3010,13 +3017,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 240(%rax) -; SSE-NEXT: movaps %xmm5, 224(%rax) -; SSE-NEXT: movaps %xmm9, 208(%rax) -; SSE-NEXT: movaps %xmm11, 192(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rax) +; SSE-NEXT: movaps %xmm3, 240(%rax) +; SSE-NEXT: movaps %xmm6, 224(%rax) +; SSE-NEXT: movaps %xmm8, 208(%rax) +; SSE-NEXT: movaps %xmm9, 192(%rax) +; SSE-NEXT: movaps %xmm10, 176(%rax) +; SSE-NEXT: movaps %xmm15, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3038,18 +3044,18 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 240(%rax) +; SSE-NEXT: movaps %xmm2, 240(%rax) ; SSE-NEXT: movaps %xmm1, 224(%rax) -; SSE-NEXT: movaps %xmm2, 208(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) -; SSE-NEXT: movaps %xmm8, 176(%rax) -; SSE-NEXT: movaps %xmm6, 160(%rax) -; SSE-NEXT: movaps %xmm10, 144(%rax) -; SSE-NEXT: movaps %xmm12, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps %xmm4, 208(%rax) +; SSE-NEXT: movaps %xmm5, 192(%rax) +; SSE-NEXT: movaps %xmm7, 176(%rax) +; SSE-NEXT: movaps %xmm13, 160(%rax) +; SSE-NEXT: movaps %xmm14, 144(%rax) +; SSE-NEXT: movaps %xmm11, 128(%rax) +; SSE-NEXT: movaps %xmm12, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -3066,49 +3072,49 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2280, %rsp # imm = 0x8E8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm7[0],xmm5[0] ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] +; 
AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm3[0] @@ -3163,136 +3169,136 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm11[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm2 +; 
AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = 
xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded 
Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm3[1],ymm13[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 @@ -3388,7 +3394,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 @@ -3399,354 +3405,358 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vmovaps 672(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 
1072(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; 
AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm15[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 
240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm5, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vmovaps %xmm5, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 
128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $2280, %rsp # imm = 0x8E8 -; AVX1-ONLY-NEXT: vzeroupper -; AVX1-ONLY-NEXT: retq -; -; AVX2-ONLY-LABEL: load_i64_stride8_vf32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2248, %rsp # imm = 0x8C8 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) +; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps 
%xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: 
vmovaps %xmm1, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 
128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $2216, %rsp # imm = 0x8A8 +; AVX1-ONLY-NEXT: vzeroupper +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i64_stride8_vf32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: subq $2248, %rsp # imm = 0x8C8 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] @@ -3759,7 +3769,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3775,136 +3785,136 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = 
ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm7 
+; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = 
ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = 
ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 @@ -3971,133 +3981,132 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm11 -; 
AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 
1888(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) @@ -4215,11 +4224,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4232,189 +4240,186 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-LABEL: load_i64_stride8_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2440, %rsp # imm = 0x988 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm20 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm16 -; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm16[0],ymm12[2],ymm16[2] -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm6 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm14[2,3],ymm8[2,3] -; 
AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm8 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm26 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm8 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm10 +; AVX512F-NEXT: vmovdqa64 
1664(%rdi), %ymm28 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm31 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm16[1],ymm12[3],ymm16[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm9 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm11 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm14 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm10, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm25[0],zmm24[0],zmm25[2],zmm24[2],zmm25[4],zmm24[4],zmm25[6],zmm24[6] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512F-NEXT: vpermt2q 
%zmm17, %zmm0, %zmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm16[0],zmm13[2],zmm16[2],zmm13[4],zmm16[4],zmm13[6],zmm16[6] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm31[0],zmm17[0],zmm31[2],zmm17[2],zmm31[4],zmm17[4],zmm31[6],zmm17[6] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; 
AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -4428,507 +4433,508 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm10[0],zmm30[2],zmm10[2],zmm30[4],zmm10[4],zmm30[6],zmm10[6] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm25[1],zmm24[1],zmm25[3],zmm24[3],zmm25[5],zmm24[5],zmm25[7],zmm24[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm9 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm24, %zmm9, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 ; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm30, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm13 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,5,13] -; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm30, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm19 = zmm1[1],zmm19[1],zmm1[3],zmm19[3],zmm1[5],zmm19[5],zmm1[7],zmm19[7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 
# 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm23[0],zmm17[0],zmm23[2],zmm17[2],zmm23[4],zmm17[4],zmm23[6],zmm17[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm17[1],zmm23[3],zmm17[3],zmm23[5],zmm17[5],zmm23[7],zmm17[7] -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm30, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] +; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm8, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] +; 
AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm29 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm30, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm30, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm31 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm0 # 32-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm18, %zmm14, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 704(%rdi), 
%ymm10, %ymm10 -; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm11 -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm11, %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm15 -; AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm15, %ymm15 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm14, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm22 -; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm22, %ymm22 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm24 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm24, %ymm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm26, %zmm30, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm8 +; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 +; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm25 +; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm26 +; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm1 ; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, 
%ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm14, %zmm1, %zmm14 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm15 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte 
Folded Reload +; AVX512F-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, (%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm4, 128(%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, (%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512F-NEXT: vmovaps %zmm14, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512F-NEXT: addq $2440, %rsp # imm = 0x988 +; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512F-NEXT: vmovaps %zmm10, 64(%rax) +; AVX512F-NEXT: addq $2632, 
%rsp # imm = 0xA48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2440, %rsp # imm = 0x988 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm16 -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm16[0],ymm12[2],ymm16[2] -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm29 +; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, 
%zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm14[2,3],ymm8[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm8 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm29 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm26 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 1664(%rdi), 
%ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm26[0],ymm20[0],ymm26[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm10 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %ymm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm31 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm16[1],ymm12[3],ymm16[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = 
ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm9 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm11 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm14 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm26[1],ymm20[1],ymm26[3],ymm20[3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 
1024(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm25[0],zmm24[0],zmm25[2],zmm24[2],zmm25[4],zmm24[4],zmm25[6],zmm24[6] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm16[0],zmm13[2],zmm16[2],zmm13[4],zmm16[4],zmm13[6],zmm16[6] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm31[0],zmm17[0],zmm31[2],zmm17[2],zmm31[4],zmm17[4],zmm31[6],zmm17[6] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 @@ -4942,319 +4948,323 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm10[0],zmm30[2],zmm10[2],zmm30[4],zmm10[4],zmm30[6],zmm10[6] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm30, %zmm0 -; AVX512BW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm25[1],zmm24[1],zmm25[3],zmm24[3],zmm25[5],zmm24[5],zmm25[7],zmm24[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm9, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm28[0],zmm22[0],zmm28[2],zmm22[2],zmm28[4],zmm22[4],zmm28[6],zmm22[6] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 {%k1} = zmm28[1],zmm22[1],zmm28[3],zmm22[3],zmm28[5],zmm22[5],zmm28[7],zmm22[7] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 ; AVX512BW-NEXT: 
vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm13 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,5,13] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 = zmm1[1],zmm19[1],zmm1[3],zmm19[3],zmm1[5],zmm19[5],zmm1[7],zmm19[7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm23[0],zmm17[0],zmm23[2],zmm17[2],zmm23[4],zmm17[4],zmm23[6],zmm17[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm23[1],zmm17[1],zmm23[3],zmm17[3],zmm23[5],zmm17[5],zmm23[7],zmm17[7] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] 
+; AVX512BW-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm30, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm8, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm29 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm21 -; AVX512BW-NEXT: vpermt2q 
%zmm0, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm31 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm17 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm7[1],zmm5[1],zmm7[3],zmm5[3],zmm7[5],zmm5[5],zmm7[7],zmm5[7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm0 # 32-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm14, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; 
AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm11 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm11, %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm13 -; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm15 -; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm15, %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm14, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm22 -; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm22, %ymm22 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm24 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm24, %ymm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[2],ymm22[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm30, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm8 +; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm25 +; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm26 +; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm11 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm24[1],ymm22[1],ymm24[3],ymm22[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, 
%zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm1 ; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm14, %zmm1, %zmm14 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm15 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 
# 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, (%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm4, 128(%r9) ; 
AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, (%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm3, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512BW-NEXT: vmovaps %zmm14, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-NEXT: addq $2440, %rsp # imm = 0x988 +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512BW-NEXT: vmovaps %zmm10, 64(%rax) +; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 @@ -6107,29 +6117,29 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 2928(%rdi), %xmm0 -; SSE-NEXT: movaps 2864(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps 2864(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 3056(%rdi), %xmm0 -; SSE-NEXT: movaps 2992(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 2992(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 3184(%rdi), %xmm0 -; SSE-NEXT: movaps 3120(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps 3120(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 3312(%rdi), %xmm0 -; SSE-NEXT: movaps 3248(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps 3248(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 3440(%rdi), %xmm0 ; SSE-NEXT: movaps 3376(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm1 @@ -6137,19 +6147,19 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 3568(%rdi), %xmm0 -; SSE-NEXT: movaps 3504(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 4016(%rdi), %xmm2 -; SSE-NEXT: movaps 3952(%rdi), %xmm4 +; SSE-NEXT: movaps 3504(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 4016(%rdi), %xmm4 +; SSE-NEXT: movaps 3952(%rdi), %xmm3 ; SSE-NEXT: movaps 3696(%rdi), %xmm0 -; SSE-NEXT: movaps 3632(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps 3632(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movaps 4080(%rdi), %xmm1 ; SSE-NEXT: movaps 3888(%rdi), %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps 3824(%rdi), %xmm6 ; SSE-NEXT: movaps 3760(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm7, 496(%rsi) @@ -6159,11 +6169,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps %xmm6, 480(%rsi) ; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 464(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6546,11 +6556,11 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 496(%rax) +; SSE-NEXT: movaps %xmm3, 496(%rax) ; SSE-NEXT: movaps %xmm6, 480(%rax) ; SSE-NEXT: movaps %xmm7, 464(%rax) -; SSE-NEXT: movaps %xmm11, 448(%rax) -; SSE-NEXT: movaps %xmm14, 432(%rax) +; SSE-NEXT: movaps %xmm10, 448(%rax) +; SSE-NEXT: movaps %xmm13, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 416(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6606,16 +6616,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 496(%rax) +; SSE-NEXT: movaps %xmm4, 496(%rax) ; SSE-NEXT: movaps %xmm5, 
480(%rax) ; SSE-NEXT: movaps %xmm0, 464(%rax) -; SSE-NEXT: movaps %xmm3, 448(%rax) -; SSE-NEXT: movaps %xmm10, 432(%rax) +; SSE-NEXT: movaps %xmm2, 448(%rax) +; SSE-NEXT: movaps %xmm8, 432(%rax) ; SSE-NEXT: movaps %xmm12, 416(%rax) -; SSE-NEXT: movaps %xmm8, 400(%rax) -; SSE-NEXT: movaps %xmm9, 384(%rax) -; SSE-NEXT: movaps %xmm13, 368(%rax) -; SSE-NEXT: movaps %xmm15, 352(%rax) +; SSE-NEXT: movaps %xmm9, 400(%rax) +; SSE-NEXT: movaps %xmm15, 384(%rax) +; SSE-NEXT: movaps %xmm11, 368(%rax) +; SSE-NEXT: movaps %xmm14, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6665,7 +6675,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $4984, %rsp # imm = 0x1378 +; AVX1-ONLY-NEXT: subq $5016, %rsp # imm = 0x1398 ; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -6858,236 +6868,232 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: 
vmovaps 784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2384(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2832(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 2832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3344(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 3344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3920(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3856(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 3920(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 3856(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm8[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm10[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm11[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7101,9 +7107,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7117,9 +7121,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm13[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7133,9 +7135,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7149,9 +7149,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm15[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 @@ -7207,7 +7213,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7231,7 +7237,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7346,655 +7352,659 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), 
%xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), 
%ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm9[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} 
xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2720(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2608(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm10[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2864(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2720(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3296(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2672(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2608(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3120(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm12[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3552(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: 
vmovaps 2864(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3296(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3808(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 3120(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3744(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3632(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3552(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 3440(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4064(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3808(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 3744(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 3696(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 3632(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4000(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3952(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3888(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 4064(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 4000(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: 
vmovaps 3952(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3888(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; 
AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm8[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm9[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm10[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps 
%ymm8, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 480(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 320(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9) +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rsi) +; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rdx) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 
64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 
144(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 480(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rax) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8011,7 +8021,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $4984, %rsp # imm = 0x1378 +; AVX1-ONLY-NEXT: addq $5016, %rsp # imm = 0x1398 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -8148,259 +8158,258 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 +; 
AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 3264(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 3200(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 3776(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 3712(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] @@ -8415,7 +8424,8 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8427,7 +8437,7 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8568,180 +8578,180 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 
1312(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; 
AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2528(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2528(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3040(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3168(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3104(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3168(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3296(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 3232(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 3104(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3296(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 3232(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3424(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 3552(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 3488(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 3424(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 3552(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 3488(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; 
AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3680(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 3616(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 3808(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 3744(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 3680(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 3616(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 3808(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 3744(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 3936(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 3872(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 3872(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 4064(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 4000(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8753,10 +8763,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8839,19 +8849,19 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] 
+; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm6[1],ymm13[3],ymm6[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) @@ -9081,9 +9091,9 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 384(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 448(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) @@ -9114,98 +9124,105 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-LABEL: load_i64_stride8_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3456(%rdi), 
%zmm10 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %ymm30 +; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm30[0],ymm0[2],ymm30[2] -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-NEXT: vmovdqa 3136(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm31[0],ymm3[2],ymm31[2] +; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm14 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: 
vmovdqa 704(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm3[0],ymm26[2],ymm3[2] +; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm23 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm23[0],ymm19[2],ymm23[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm18[0],ymm12[2],ymm18[2] +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %ymm25 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm25[0],ymm16[2],ymm25[2] +; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm21[0],ymm20[2],ymm21[2] +; AVX512F-NEXT: vmovdqa64 1536(%rdi), 
%ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9216,17 +9233,18 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm27 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm24 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9241,13 +9259,14 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 2688(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 2688(%rdi), %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 2624(%rdi), %ymm10 -; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %ymm31 +; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9256,20 +9275,18 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 
2304(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 2240(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 2176(%rdi), %ymm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %ymm22 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %ymm28 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %ymm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512F-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9278,148 +9295,146 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 3776(%rdi), %ymm24 -; AVX512F-NEXT: vmovdqa 3712(%rdi), %ymm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 3776(%rdi), %ymm12 +; AVX512F-NEXT: vmovdqa 3712(%rdi), %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm30[1],ymm13[3],ymm30[3] -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm13[1],ymm31[1],ymm13[3],ymm31[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm14 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm26[1],mem[1],ymm26[3],mem[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm19[1],ymm23[1],ymm19[3],ymm23[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm12[1],ymm18[1],ymm12[3],ymm18[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm16[1],ymm25[1],ymm16[3],ymm25[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm21[1],ymm20[3],ymm21[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm2, %zmm14 -; AVX512F-NEXT: 
vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm22, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm22[1],mem[1],ymm22[3],mem[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} 
ymm3 = ymm8[1],ymm24[1],ymm8[3],ymm24[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm4[0],zmm24[2],zmm4[2],zmm24[4],zmm4[4],zmm24[6],zmm4[6] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = 
zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm4 @@ -9430,15 +9445,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm31[0],mem[0],zmm31[2],mem[2],zmm31[4],mem[4],zmm31[6],mem[6] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 @@ -9448,72 +9465,72 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm26[0],zmm6[2],zmm26[2],zmm6[4],zmm26[4],zmm6[6],zmm26[6] +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm26 
-; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm18[0],zmm12[2],zmm18[2],zmm12[4],zmm18[4],zmm12[6],zmm18[6] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm30 ; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm23 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm16 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm28 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9528,16 +9545,14 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm13[0],zmm11[0],zmm13[2],zmm11[2],zmm13[4],zmm11[4],zmm13[6],zmm11[6] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqa64 3648(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3584(%rdi), %zmm3 @@ -9549,9 +9564,9 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm9[0],zmm1[2],zmm9[2],zmm1[4],zmm9[4],zmm1[6],zmm9[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512F-NEXT: vpunpcklqdq 
{{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] @@ -9561,118 +9576,117 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm22[1],zmm25[3],zmm22[3],zmm25[5],zmm22[5],zmm25[7],zmm22[7] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; 
AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm19[1],mem[1],zmm19[3],mem[3],zmm19[5],mem[5],zmm19[7],mem[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, 
%zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm24[1],mem[1],zmm24[3],mem[3],zmm24[5],mem[5],zmm24[7],mem[7] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm8[1],mem[1],zmm8[3],mem[3],zmm8[5],mem[5],zmm8[7],mem[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; 
AVX512F-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm1 = zmm1[1],mem[1],zmm1[3],mem[3],zmm1[5],mem[5],zmm1[7],mem[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9683,236 +9697,241 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: 
vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, 
%zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm28[0],zmm29[2],zmm28[2],zmm29[4],zmm28[4],zmm29[6],zmm28[6] +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; 
AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm8 ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm6[0],zmm11[0],zmm6[2],zmm11[2],zmm6[4],zmm11[4],zmm6[6],zmm11[6] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm25[1],zmm2[1],zmm25[3],zmm2[3],zmm25[5],zmm2[5],zmm25[7],zmm2[7] -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm24 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 ; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm19 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm31[1],zmm27[3],zmm31[3],zmm27[5],zmm31[5],zmm27[7],zmm31[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm29[1],zmm28[1],zmm29[3],zmm28[3],zmm29[5],zmm28[5],zmm29[7],zmm28[7] -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vpermt2q %zmm18, %zmm1, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm8, %zmm31 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm14[1],zmm21[1],zmm14[3],zmm21[3],zmm14[5],zmm21[5],zmm14[7],zmm21[7] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] ; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm6, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm9, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm18[1],zmm11[1],zmm18[3],zmm11[3],zmm18[5],zmm11[5],zmm18[7],zmm11[7] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 -; 
AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -9923,165 +9942,160 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm20 -; AVX512F-NEXT: vinserti32x4 $1, 192(%rdi), %ymm20, %ymm20 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm0[0],ymm20[0],ymm0[2],ymm20[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm25 -; AVX512F-NEXT: vinserti32x4 $1, 704(%rdi), %ymm25, %ymm25 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm30 -; AVX512F-NEXT: vinserti32x4 $1, 640(%rdi), %ymm30, %ymm30 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm30[0],ymm25[0],ymm30[2],ymm25[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %xmm16 -; AVX512F-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm16, %ymm16 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm27 -; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm27, %ymm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm27[0],ymm16[0],ymm27[2],ymm16[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm17, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm9 -; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm9, %ymm9 -; AVX512F-NEXT: 
vmovdqa64 1536(%rdi), %xmm19 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm19, %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm19[0],ymm9[0],ymm19[2],ymm9[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm24, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa 2112(%rdi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, 2240(%rdi), %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm24 -; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm24, %ymm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm24[0],ymm5[0],ymm24[2],ymm5[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm13 -; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm13, %ymm13 -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm28 -; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm7 -; AVX512F-NEXT: vinserti128 $1, 3264(%rdi), %ymm7, %ymm7 -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %xmm26 -; AVX512F-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm26, %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm26[0],ymm7[0],ymm26[2],ymm7[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm17, %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 3776(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vmovdqa64 3584(%rdi), %xmm17 -; AVX512F-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm17, %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm17[0],ymm10[0],ymm17[2],ymm10[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm26[1],ymm7[1],ymm26[3],ymm7[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm30[1],ymm25[1],ymm30[3],ymm25[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm20[1],ymm0[3],ymm20[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm9[1],ymm19[3],ymm9[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm27[1],ymm16[1],ymm27[3],ymm16[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm31, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm24[1],ymm5[1],ymm24[3],ymm5[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm17[1],ymm10[1],ymm17[3],ymm10[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm24 +; AVX512F-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm24 +; AVX512F-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm9 +; AVX512F-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm17 +; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %xmm19 +; AVX512F-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm29 +; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 +; AVX512F-NEXT: 
vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512F-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %xmm20 +; AVX512F-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm7 +; AVX512F-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 +; AVX512F-NEXT: vmovdqa64 3584(%rdi), %xmm21 +; AVX512F-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, 
320(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 128(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10165,13 +10179,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10185,98 +10201,105 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-LABEL: load_i64_stride8_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %ymm30 +; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm30[0],ymm0[2],ymm30[2] -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %ymm31 -; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm31[0],ymm3[2],ymm31[2] +; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm14 +; 
AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm3[0],ymm26[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm23 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm23[0],ymm19[2],ymm23[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm18[0],ymm12[2],ymm18[2] +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm30 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm25[0],ymm16[2],ymm25[2] +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm21[0],ymm20[2],ymm21[2] +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10287,17 +10310,18 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm27 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10312,13 +10336,14 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %ymm10 -; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm31 +; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10327,20 +10352,18 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 2240(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %ymm28 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10349,148 +10372,146 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %ymm24 -; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 3776(%rdi), %ymm12 
+; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm30[1],ymm13[3],ymm30[3] -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm13[1],ymm31[1],ymm13[3],ymm31[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm14 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm15 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm15 = ymm26[1],mem[1],ymm26[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm5, 
%zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm19[1],ymm23[1],ymm19[3],ymm23[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm12[1],ymm18[1],ymm12[3],ymm18[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm16[1],ymm25[1],ymm16[3],ymm25[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm20[1],ymm21[1],ymm20[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: 
vmovdqa64 %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm28[1],ymm27[1],ymm28[3],ymm27[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm22, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm22[1],mem[1],ymm22[3],mem[3] -; 
AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm8[1],ymm24[1],ymm8[3],ymm24[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq 
{{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm4[0],zmm24[2],zmm4[2],zmm24[4],zmm4[4],zmm24[6],zmm4[6] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm4 @@ -10501,15 +10522,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm31[0],mem[0],zmm31[2],mem[2],zmm31[4],mem[4],zmm31[6],mem[6] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 @@ -10519,72 +10542,72 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm26[0],zmm6[2],zmm26[2],zmm6[4],zmm26[4],zmm6[6],zmm26[6] +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm18[0],zmm12[2],zmm18[2],zmm12[4],zmm18[4],zmm12[6],zmm18[6] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; 
AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm20[0],zmm16[2],zmm20[2],zmm16[4],zmm20[4],zmm16[6],zmm20[6] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10599,16 +10622,14 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm13[0],zmm11[0],zmm13[2],zmm11[2],zmm13[4],zmm11[4],zmm13[6],zmm11[6] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm3 @@ -10620,9 +10641,9 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm9[0],zmm1[2],zmm9[2],zmm1[4],zmm9[4],zmm1[6],zmm9[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] @@ -10632,118 +10653,117 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; 
AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm22[1],zmm25[3],zmm22[3],zmm25[5],zmm22[5],zmm25[7],zmm22[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm19[1],mem[1],zmm19[3],mem[3],zmm19[5],mem[5],zmm19[7],mem[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm24[1],mem[1],zmm24[3],mem[3],zmm24[5],mem[5],zmm24[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, 
%zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm8[1],mem[1],zmm8[3],mem[3],zmm8[5],mem[5],zmm8[7],mem[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm8[1],zmm7[1],zmm8[3],zmm7[3],zmm8[5],zmm7[5],zmm8[7],zmm7[7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm1 = zmm1[1],mem[1],zmm1[3],mem[3],zmm1[5],mem[5],zmm1[7],mem[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10754,236 +10774,241 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 {%k1} = zmm17[0],mem[0],zmm17[2],mem[2],zmm17[4],mem[4],zmm17[6],mem[6] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 {%k1} = zmm27[0],mem[0],zmm27[2],mem[2],zmm27[4],mem[4],zmm27[6],mem[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm18[0],zmm27[2],zmm18[2],zmm27[4],zmm18[4],zmm27[6],zmm18[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm31[0],zmm27[2],zmm31[2],zmm27[4],zmm31[4],zmm27[6],zmm31[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm28[0],zmm29[2],zmm28[2],zmm29[4],zmm28[4],zmm29[6],zmm28[6] +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm21[0],zmm14[2],zmm21[2],zmm14[4],zmm21[4],zmm14[6],zmm21[6] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm8 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm6[0],zmm11[0],zmm6[2],zmm11[2],zmm6[4],zmm11[4],zmm6[6],zmm11[6] +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm25[1],zmm2[1],zmm25[3],zmm2[3],zmm25[5],zmm2[5],zmm25[7],zmm2[7] -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm24 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 ; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm20 -; AVX512BW-NEXT: 
vpermt2q %zmm30, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm19 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm31[1],zmm27[3],zmm31[3],zmm27[5],zmm31[5],zmm27[7],zmm31[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512BW-NEXT: vpermt2q 
%zmm16, %zmm14, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm29[1],zmm28[1],zmm29[3],zmm28[3],zmm29[5],zmm28[5],zmm29[7],zmm28[7] -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm8, %zmm31 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm14[1],zmm21[1],zmm14[3],zmm21[3],zmm14[5],zmm21[5],zmm14[7],zmm21[7] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] ; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 
+; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm21 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm9, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm18[1],zmm11[1],zmm18[3],zmm11[3],zmm18[5],zmm11[5],zmm18[7],zmm11[7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm27 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm2[1],zmm4[1],zmm2[3],zmm4[3],zmm2[5],zmm4[5],zmm2[7],zmm4[7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm11 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload @@ -10994,165 +11019,160 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 
$0, %ymm0, %zmm2, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm18 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm20 -; AVX512BW-NEXT: vinserti32x4 $1, 192(%rdi), %ymm20, %ymm20 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm0[0],ymm20[0],ymm0[2],ymm20[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm25 -; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm25, %ymm25 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm30 -; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm30, %ymm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm30[0],ymm25[0],ymm30[2],ymm25[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm16, %ymm16 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm27 -; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm27, %ymm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm27[0],ymm16[0],ymm27[2],ymm16[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm9 -; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm9, %ymm9 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm19 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm19, %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm19[0],ymm9[0],ymm19[2],ymm9[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm24, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa 2112(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm24 -; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm24, %ymm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm24[0],ymm5[0],ymm24[2],ymm5[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm13 -; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm13, %ymm13 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm28 -; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm28, %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm13[0],ymm28[2],ymm13[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 
3264(%rdi), %ymm7, %ymm7 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm26 -; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm26, %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm26[0],ymm7[0],ymm26[2],ymm7[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm17, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %xmm17 -; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm17, %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm17[0],ymm10[0],ymm17[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm26[1],ymm7[1],ymm26[3],ymm7[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm30[1],ymm25[1],ymm30[3],ymm25[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm20[1],ymm0[3],ymm20[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm9[1],ymm19[3],ymm9[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm27[1],ymm16[1],ymm27[3],ymm16[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm28[1],ymm13[1],ymm28[3],ymm13[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm24[1],ymm5[1],ymm24[3],ymm5[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 
{%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm17[1],ymm10[1],ymm17[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm24 +; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm24 +; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm9 +; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %xmm19 +; AVX512BW-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm26 +; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm29 +; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm20 +; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 +; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %xmm21 +; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = 
ymm20[1],ymm6[1],ymm20[3],ymm6[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 
%zmm4, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11236,13 +11256,15 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index f0118bc3b33b6..19c97ece8937f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -396,238 +396,237 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm14, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 48(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpckhbw 
{{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm11 -; SSE-NEXT: por %xmm9, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm9, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm13[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3] -; SSE-NEXT: packuswb %xmm11, %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: 
movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pand %xmm15, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; 
SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm11, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pand %xmm7, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: por %xmm14, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm14, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; 
SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: por %xmm13, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,1,3] +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm12, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movdqa %xmm6, 16(%rdx) -; SSE-NEXT: movdqa %xmm8, (%rdx) -; SSE-NEXT: movdqa %xmm3, 16(%rcx) -; SSE-NEXT: movdqa %xmm11, (%rcx) +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm12, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; 
SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm11, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movdqa %xmm7, 16(%rdx) +; SSE-NEXT: movdqa %xmm3, (%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride3_vf32: @@ -768,592 +767,597 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i8_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] +; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: punpckhbw 
{{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: packuswb %xmm0, %xmm9 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,0] 
+; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSE-NEXT: packuswb %xmm0, 
%xmm9 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: 
pand %xmm12, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm9, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm13, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm13, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} 
xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm9, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm13[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] ; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: por %xmm11, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: pand %xmm12, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm11, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; 
SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm5[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: por %xmm11, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm15 -; 
SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm11, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: pand %xmm2, %xmm14 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = 
xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] ; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movdqa %xmm6, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm13, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movdqa %xmm5, 
32(%rcx) -; SSE-NEXT: movdqa %xmm8, 48(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) -; SSE-NEXT: addq $152, %rsp +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movdqa %xmm7, 32(%rdx) +; SSE-NEXT: movdqa %xmm13, 48(%rdx) +; SSE-NEXT: movdqa %xmm10, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movdqa %xmm4, 32(%rcx) +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm9, (%rcx) +; 
SSE-NEXT: movdqa %xmm5, 16(%rcx) +; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm12, 
%xmm10, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm9, %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm14, 
%xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm15, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 16(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 32(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 16(%rsi) +; 
AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 16(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride3_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index d995051642643..764020b3b48c6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -297,12 +297,12 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: packuswb %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm3 @@ -310,72 +310,72 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; SSE-NEXT: packuswb %xmm8, %xmm14 -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = 
xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm13[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: packuswb %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm12[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] ; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: packuswb %xmm15, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm14[0,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = 
xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: packuswb %xmm15, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm14[0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm11, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm13, %xmm8 ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm13 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm8[0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] @@ -383,24 +383,24 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm8[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; SSE-NEXT: packuswb %xmm3, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] ; SSE-NEXT: movdqa %xmm0, (%rsi) -; SSE-NEXT: movaps %xmm9, (%rdx) +; SSE-NEXT: movaps %xmm10, (%rdx) ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps %xmm5, (%r8) ; SSE-NEXT: retq @@ -528,37 +528,38 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $120, %rsp +; SSE-NEXT: subq $136, %rsp ; SSE-NEXT: movdqa 64(%rdi), %xmm4 -; SSE-NEXT: movdqa 80(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm5 ; SSE-NEXT: movdqa 96(%rdi), %xmm15 -; SSE-NEXT: movdqa 112(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 112(%rdi), %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm6 ; SSE-NEXT: packuswb %xmm1, %xmm6 ; SSE-NEXT: pxor %xmm4, %xmm4 @@ -573,83 +574,85 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: packuswb %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: packuswb %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm7[0,3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: packuswb %xmm9, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm8[0,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: packuswb %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: packuswb %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: packuswb %xmm1, %xmm4 @@ -660,26 +663,26 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3] -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] @@ -687,21 +690,20 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm7 +; SSE-NEXT: pand %xmm0, 
%xmm10 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] @@ -712,65 +714,66 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: packuswb %xmm2, %xmm8 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: packuswb %xmm2, %xmm7 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm8[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; 
SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: packuswb %xmm8, %xmm9 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: packuswb %xmm7, %xmm11 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm9[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: packuswb %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm11[0,3] ; SSE-NEXT: movdqa %xmm6, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps %xmm4, 16(%rdx) -; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movaps %xmm13, (%rdx) ; SSE-NEXT: movaps %xmm3, 16(%rcx) ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm7, 16(%r8) +; SSE-NEXT: movaps %xmm8, 16(%r8) ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: addq $120, %rsp +; SSE-NEXT: addq $136, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf32: @@ -1024,87 +1027,87 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $632, %rsp # imm = 0x278 -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm13 -; SSE-NEXT: movdqa 48(%rdi), %xmm7 -; SSE-NEXT: movdqa 128(%rdi), %xmm14 +; SSE-NEXT: subq $664, %rsp # imm = 0x298 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rdi), %xmm14 +; SSE-NEXT: movdqa 128(%rdi), %xmm15 ; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa 
176(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,0,255,0,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm1, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: packuswb %xmm0, %xmm5 +; SSE-NEXT: packuswb %xmm1, %xmm5 ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm0 @@ -1117,32 +1120,32 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: packuswb %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1153,112 +1156,114 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; 
SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: packuswb %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: packuswb %xmm1, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm0[0,3] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; SSE-NEXT: packuswb %xmm1, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[0,3] +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, 
%xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] @@ -1277,43 +1282,44 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = 
xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm3[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm3[0,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] @@ -1333,7 +1339,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] @@ -1359,188 +1365,189 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,6,5,4] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm3 +; SSE-NEXT: packuswb %xmm6, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = 
xmm6[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm7 ; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm7[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: packuswb %xmm7, %xmm0 -; SSE-NEXT: 
shufps {{.*#+}} xmm0 = xmm0[0,3],xmm8[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: packuswb %xmm8, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: packuswb %xmm7, %xmm8 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm9[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: pshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: packuswb %xmm9, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: packuswb %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm8[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: packuswb %xmm8, %xmm10 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm11[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} 
xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; SSE-NEXT: packuswb %xmm11, %xmm12 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: packuswb %xmm11, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm10[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: packuswb %xmm10, %xmm11 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: packuswb %xmm11, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm12[0,3] -; SSE-NEXT: movdqa %xmm6, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] +; SSE-NEXT: packuswb %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm11[0,3] +; SSE-NEXT: movdqa %xmm5, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rsi) ; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps 
%xmm14, (%rdx) +; SSE-NEXT: movaps %xmm15, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rcx) +; SSE-NEXT: movaps %xmm6, 48(%rcx) ; SSE-NEXT: movaps %xmm3, 32(%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps %xmm10, (%rcx) +; SSE-NEXT: movaps %xmm12, (%rcx) ; SSE-NEXT: movaps %xmm13, 48(%r8) ; SSE-NEXT: movaps %xmm8, 32(%r8) ; SSE-NEXT: movaps %xmm7, 16(%r8) ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: addq $632, %rsp # imm = 0x278 +; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 @@ -1690,104 +1697,107 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = 
[3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride4_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $216, %rsp +; AVX2-ONLY-NEXT: subq $168, %rsp ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm9 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm1, %ymm9 -; 
AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm2, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm1, %ymm9 ; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm2, %ymm11 +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm1, %ymm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1800,24 +1810,20 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm14 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm1, %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm15 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, %xmm7 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1827,15 +1833,15 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa %xmm5, %xmm14 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, 
%ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm6, %ymm4 -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1846,95 +1852,93 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm12, %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm13 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm1, %ymm13 ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm6, %ymm15 -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm1, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 
-; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm1 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm10, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm14, %ymm3 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm1, %ymm6 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm1, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm6 -; 
AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm2, %ymm4 -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm14, %ymm5 +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-ONLY-NEXT: addq $216, %rsp +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-ONLY-NEXT: addq $168, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index fec4b1b0511ce..1532859fb115a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -422,283 +422,276 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm10 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: movdqa 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pxor %xmm8, %xmm8 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm11, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm15 
+; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3] -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm11, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm12[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm1 
-; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm13, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm6[8],xmm13[9],xmm6[9],xmm13[10],xmm6[10],xmm13[11],xmm6[11],xmm13[12],xmm6[12],xmm13[13],xmm6[13],xmm13[14],xmm6[14],xmm13[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; SSE-NEXT: pand %xmm14, %xmm13 +; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: pand %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: por %xmm14, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,4,5,7] -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm4 -; SSE-NEXT: packuswb %xmm11, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm10[3,0] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2] +; SSE-NEXT: pand %xmm15, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] +; SSE-NEXT: 
pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,5,7] +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm0 +; SSE-NEXT: packuswb %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[3,0] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm13, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = 
[0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm10, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: 
packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pandn %xmm15, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[2,0] -; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,0] +; SSE-NEXT: por %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[3,0] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: por %xmm11, %xmm12 ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; 
SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm1[1,2] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm15, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm1[1,2] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: por %xmm4, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm12, (%r8) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movdqa %xmm10, (%rcx) +; SSE-NEXT: movdqa %xmm6, (%r8) ; SSE-NEXT: movaps %xmm4, (%r9) ; SSE-NEXT: retq ; @@ -958,156 +951,158 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: subq $184, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: 
movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 96(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm6 ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pandn 
%xmm0, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa 144(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pxor %xmm5, 
%xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] @@ -1118,85 +1113,84 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: psllq $48, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: psllq $48, %xmm11 -; SSE-NEXT: packuswb %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: psllq $48, %xmm1 +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 
16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 ; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,1,3] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2] @@ -1204,158 +1198,162 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,0,0,65535,65535,65535,65535,65535] 
-; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm7, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm11 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: 
packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: por %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm11, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm14 +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: por %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm2[2,0] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm11[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] @@ -1364,68 +1362,69 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[3,0] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = 
xmm6[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] ; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm1[2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,7,4,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm11[3,0] -; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0,2] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,0,1,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] @@ -1435,77 +1434,77 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm0, 
%xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] ; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: por %xmm11, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[3,1,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: packuswb %xmm1, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,1] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm1[1,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] -; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; SSE-NEXT: packuswb %xmm1, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,1] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = 
xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[1,2] +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: packuswb %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[3,1,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: packuswb %xmm3, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1514,14 +1513,14 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm9, 16(%rcx) +; SSE-NEXT: movdqa %xmm12, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; 
SSE-NEXT: movdqa %xmm8, 16(%r8) -; SSE-NEXT: movdqa %xmm4, (%r8) +; SSE-NEXT: movdqa %xmm9, 16(%r8) +; SSE-NEXT: movdqa %xmm8, (%r8) ; SSE-NEXT: movaps %xmm1, 16(%r9) ; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: addq $168, %rsp +; SSE-NEXT: addq $184, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf32: @@ -1541,13 +1540,13 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] @@ -1560,18 +1559,18 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm14, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 @@ -1580,26 +1579,26 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm8[u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 @@ -1607,14 +1606,14 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] ; AVX1-ONLY-NEXT: 
vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 @@ -1625,23 +1624,23 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm15 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm8[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -1654,14 +1653,14 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; 
AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -2012,61 +2011,62 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i8_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $568, %rsp # imm = 0x238 +; SSE-NEXT: subq $552, %rsp # imm = 0x228 ; SSE-NEXT: movdqa 160(%rdi), %xmm9 ; SSE-NEXT: movdqa 176(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pxor %xmm12, %xmm12 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: movdqa %xmm2, 
%xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm9, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm9 ; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE-NEXT: pand %xmm8, %xmm9 ; SSE-NEXT: por %xmm2, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] @@ -2074,70 +2074,74 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: 
movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 48(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] @@ -2145,13 +2149,13 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,3] @@ -2159,663 +2163,617 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] -; SSE-NEXT: pshufhw 
{{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: packuswb %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa 240(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm12, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3]
+; SSE-NEXT: movdqa %xmm11, %xmm3
+; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: movdqa 96(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm10, %xmm2
+; SSE-NEXT: pandn %xmm4, %xmm2
+; SSE-NEXT: movdqa 80(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm10, %xmm4
 ; SSE-NEXT: por %xmm2, %xmm4
 ; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; SSE-NEXT: packuswb %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3]
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: movdqa 96(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm15, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: movdqa 80(%rdi), %xmm9
-; SSE-NEXT: movdqa %xmm9, %xmm7
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm15, %xmm7
-; SSE-NEXT: por %xmm2, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
-; SSE-NEXT: pand %xmm6, %xmm7
-; SSE-NEXT: pandn %xmm2, %xmm6
-; SSE-NEXT: por %xmm7, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,1,3,4,5,6,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm8
+; SSE-NEXT: por %xmm4, %xmm8
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,1,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
 ; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: pand %xmm10, %xmm2
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: movdqa 144(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
+; SSE-NEXT: pand %xmm11, %xmm2
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: movdqa 144(%rdi), %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4]
-; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: movdqa %xmm14, %xmm3
-; SSE-NEXT: pandn %xmm4, %xmm3
-; SSE-NEXT: pand %xmm14, %xmm2
-; SSE-NEXT: por %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm15, %xmm2
-; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm13, %xmm4
-; SSE-NEXT: pand %xmm15, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
+; SSE-NEXT: packuswb %xmm3, %xmm3
+; SSE-NEXT: movdqa %xmm6, %xmm14
+; SSE-NEXT: movdqa %xmm6, %xmm4
+; SSE-NEXT: pandn %xmm3, %xmm4
+; SSE-NEXT: pand %xmm6, %xmm2
 ; SSE-NEXT: por %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm4, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7]
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm10, %xmm2
+; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: pand %xmm10, %xmm3
+; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; SSE-NEXT: psllq $48, %xmm4
-; SSE-NEXT: packuswb %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pandn %xmm4, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: pand %xmm11, %xmm4
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm4, %xmm7
-; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
+; SSE-NEXT: psllq $48, %xmm3
+; SSE-NEXT: packuswb %xmm2, %xmm3
+; SSE-NEXT: movdqa %xmm11, %xmm4
+; SSE-NEXT: pandn %xmm3, %xmm4
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm6, %xmm3
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm3, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0]
-; SSE-NEXT: movdqa %xmm2, %xmm12
-; SSE-NEXT: pandn %xmm7, %xmm12
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: por %xmm12, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,5,7]
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: pandn %xmm7, %xmm8
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: por %xmm8, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,5,7]
+; SSE-NEXT: packuswb %xmm3, %xmm3
+; SSE-NEXT: pand %xmm11, %xmm3
+; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
 ; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: movdqa %xmm0, %xmm12
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: por %xmm6, %xmm4
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movaps %xmm3, %xmm6
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
-; SSE-NEXT: packuswb %xmm6, %xmm6
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: pandn %xmm6, %xmm0
-; SSE-NEXT: pand %xmm14, %xmm4
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, %xmm7
+; SSE-NEXT: pandn %xmm4, %xmm7
+; SSE-NEXT: pand %xmm14, %xmm3
+; SSE-NEXT: por %xmm3, %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm10, %xmm3
+; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
 ; SSE-NEXT: movdqa %xmm15, %xmm4
-; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload
-; SSE-NEXT: pand %xmm15, %xmm6
-; SSE-NEXT: por %xmm4, %xmm6
-; SSE-NEXT: movdqa %xmm6, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm6, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,3]
+; SSE-NEXT: pand %xmm10, %xmm4
+; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: por %xmm3, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm3
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm4, %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3]
 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
-; SSE-NEXT: psllq $48, %xmm6
-; SSE-NEXT: packuswb %xmm4, %xmm6
-; SSE-NEXT: movdqa %xmm12, %xmm4
-; SSE-NEXT: pandn %xmm6, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm15, %xmm6
-; SSE-NEXT: pand %xmm11, %xmm6
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
+; SSE-NEXT: psllq $48, %xmm4
+; SSE-NEXT: packuswb %xmm3, %xmm4
+; SSE-NEXT: movdqa %xmm11, %xmm3
+; SSE-NEXT: pandn %xmm4, %xmm3
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm10, %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
 ; SSE-NEXT: movdqa %xmm2, %xmm7
 ; SSE-NEXT: pandn %xmm0, %xmm7
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: por %xmm7, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,1,3]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: por %xmm7, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pand %xmm12, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
-; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: movdqa %xmm14, %xmm5
-; SSE-NEXT: pandn %xmm4, %xmm5
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
+; SSE-NEXT: packuswb %xmm3, %xmm3
+; SSE-NEXT: movdqa %xmm14, %xmm4
+; SSE-NEXT: pandn %xmm3, %xmm4
 ; SSE-NEXT: pand %xmm14, %xmm0
-; SSE-NEXT: por %xmm0, %xmm5
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; SSE-NEXT: por %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm5, %xmm0
 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: pand %xmm5, %xmm4
-; SSE-NEXT: por %xmm0, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm4, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,6,7]
+; SSE-NEXT: movdqa %xmm13, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm3
+; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; SSE-NEXT: psllq $48, %xmm4
-; SSE-NEXT: packuswb %xmm0, %xmm4
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: pandn %xmm4, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: pand %xmm11, %xmm4
-; SSE-NEXT: por %xmm1, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm1, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: por %xmm6, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3]
+; SSE-NEXT: psllq $48, %xmm3
+; SSE-NEXT: packuswb %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: pandn %xmm3, %xmm0
+; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: movdqa %xmm8, %xmm7
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pandn %xmm1, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
 ; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pand %xmm12, %xmm1
+; SSE-NEXT: pand %xmm11, %xmm1
 ; SSE-NEXT: por %xmm0, %xmm1
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,7,6,7]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm14, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm4
+; SSE-NEXT: movdqa %xmm14, %xmm3
+; SSE-NEXT: pandn %xmm0, %xmm3
 ; SSE-NEXT: pand %xmm14, %xmm1
-; SSE-NEXT: por %xmm1, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; SSE-NEXT: psllq $48, %xmm1
-; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: pandn %xmm9, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: pand %xmm11, %xmm4
-; SSE-NEXT: por %xmm0, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm12, %xmm2
-; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; SSE-NEXT: psllq $48, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm4
+; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: pandn %xmm5, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE-NEXT: pand %xmm2, %xmm3
 ; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4]
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
 ; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm14, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: pand %xmm14, %xmm0
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm9, %xmm4
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,0,65535,65535,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm12, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: pandn %xmm13, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: pandn %xmm6, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm6, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm5, %xmm6
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: por %xmm0, %xmm6
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,6,5]
-; SSE-NEXT: packuswb %xmm0, %xmm6
-; SSE-NEXT: pand %xmm12, %xmm6
-; SSE-NEXT: por %xmm2, %xmm6
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
+; SSE-NEXT: pand %xmm11, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm11
+; SSE-NEXT: por %xmm11, %xmm1
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
 ; SSE-NEXT: movdqa %xmm14, %xmm2
 ; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: pand %xmm14, %xmm6
-; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm14, %xmm1
+; SSE-NEXT: por %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm9, %xmm6
-; SSE-NEXT: pandn %xmm2, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: por %xmm6, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
+; SSE-NEXT: pand %xmm11, %xmm1
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm6, %xmm3
+; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE-NEXT: pand %xmm6, %xmm1
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,0,65535,65535,65535,65535,65535]
 ; SSE-NEXT: movdqa %xmm12, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload
-; SSE-NEXT: pandn %xmm5, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: pandn %xmm7, %xmm6
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pandn %xmm7, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: pandn %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm11, %xmm3
+; SSE-NEXT: pandn %xmm4, %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm4, %xmm7
-; SSE-NEXT: por %xmm0, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,1,2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,5,6,5]
-; SSE-NEXT: packuswb %xmm0, %xmm7
-; SSE-NEXT: pand %xmm12, %xmm7
-; SSE-NEXT: por %xmm2, %xmm7
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[1,1,1,1]
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5]
+; SSE-NEXT: packuswb %xmm1, %xmm4
+; SSE-NEXT: pand %xmm12, %xmm4
+; SSE-NEXT: por %xmm2, %xmm4
+; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: # xmm1 = mem[1,1,1,1]
 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm2 = mem[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm14, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: pand %xmm14, %xmm7
-; SSE-NEXT: por %xmm7, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm9, %xmm7
-; SSE-NEXT: pandn %xmm2, %xmm7
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: por %xmm7, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm14, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: pand %xmm14, %xmm4
+; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm11, %xmm10
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm10, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE-NEXT: movdqa %xmm6, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; SSE-NEXT: pand %xmm6, %xmm10
+; SSE-NEXT: por %xmm4, %xmm10
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
+; SSE-NEXT: packuswb %xmm1, %xmm1
 ; SSE-NEXT: movdqa %xmm12, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm7
-; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: pandn %xmm15, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
+; SSE-NEXT: movdqa %xmm10, %xmm0
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; SSE-NEXT: pandn %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm10
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: pandn %xmm3, %xmm10
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm1, %xmm10
-; SSE-NEXT: pandn %xmm3, %xmm10
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: movdqa %xmm7, %xmm13
-; SSE-NEXT: por %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5,6,5]
-; SSE-NEXT: packuswb %xmm0, %xmm3
-; SSE-NEXT: pand %xmm12, %xmm3
-; SSE-NEXT: por %xmm2, %xmm3
-; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[1,1,1,1]
-; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm14, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: pand %xmm14, %xmm3
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: pandn %xmm0, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
-; SSE-NEXT: pand %xmm9, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm12, %xmm10
-; SSE-NEXT: pandn %xmm0, %xmm10
-; SSE-NEXT: movdqa %xmm13, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE-NEXT: pandn %xmm13, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: pand %xmm11, %xmm3
-; SSE-NEXT: movdqa %xmm5, %xmm7
-; SSE-NEXT: pand %xmm11, %xmm5
-; SSE-NEXT: movdqa %xmm4, %xmm11
-; SSE-NEXT: pand %xmm0, %xmm11
-; SSE-NEXT: movdqa %xmm13, %xmm15
-; SSE-NEXT: pand %xmm0, %xmm15
-; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: pandn %xmm15, %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm7
-; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm13
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm15, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
-; SSE-NEXT: por %xmm1, %xmm15
-; SSE-NEXT: movdqa %xmm15, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,1,2,0]
-; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm1[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[0,2]
+; SSE-NEXT: movdqa %xmm11, %xmm3
+; SSE-NEXT: pandn %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
-; SSE-NEXT: packuswb %xmm1, %xmm2
-; SSE-NEXT: pand %xmm12, %xmm2
-; SSE-NEXT: por %xmm10, %xmm2
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5]
+; SSE-NEXT: packuswb %xmm1, %xmm4
+; SSE-NEXT: pand %xmm12, %xmm4
+; SSE-NEXT: por %xmm2, %xmm4
 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
+-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
+-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
 ; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: pand %xmm14, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
+-; SSE-NEXT: movdqa %xmm14, %xmm0
+-; SSE-NEXT: pandn %xmm1, %xmm0
+-; SSE-NEXT: pand %xmm14, %xmm2
+-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm14, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: pand %xmm14, %xmm4
+; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE-NEXT: pand %xmm11, %xmm1
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE-NEXT: movdqa %xmm6, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE-NEXT: pand %xmm6, %xmm1
+; SSE-NEXT: por %xmm4, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm12, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm8, %xmm4
+; SSE-NEXT: pandn %xmm13, %xmm4
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: pandn %xmm7, %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm2[2,0]
+; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: pandn %xmm7, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: por %xmm4, %xmm7
+; SSE-NEXT: movdqa %xmm7, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[0,2]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,5]
+; SSE-NEXT: packuswb %xmm4, %xmm7
+; SSE-NEXT: pand %xmm12, %xmm7
+; SSE-NEXT: por %xmm2, %xmm7
+; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[1,1,1,1]
+; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: # xmm4 = mem[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
+; SSE-NEXT: packuswb %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm14, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm1
+; SSE-NEXT: pand %xmm14, %xmm7
+; SSE-NEXT: por %xmm7, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm11, %xmm8
+; SSE-NEXT: movdqa %xmm11, %xmm2
+; SSE-NEXT: pandn %xmm5, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE-NEXT: pand %xmm11, %xmm4
+; SSE-NEXT: por %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: pandn %xmm2, %xmm7
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE-NEXT: pand %xmm6, %xmm4
+; SSE-NEXT: por %xmm7, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,6,7]
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: pandn %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm10, %xmm7
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: pandn %xmm5, %xmm7
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm0, %xmm14
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
+; SSE-NEXT: pand %xmm1, %xmm14
+; SSE-NEXT: movdqa %xmm15, %xmm11
+; SSE-NEXT: pand %xmm1, %xmm11
+; SSE-NEXT: movdqa %xmm13, %xmm4
+; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: pandn %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm8, %xmm15
+; SSE-NEXT: pand %xmm8, %xmm13
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm2, %xmm8
+; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm10, %xmm0
+; SSE-NEXT: por %xmm7, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm12, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
 ; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: pand %xmm12, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; SSE-NEXT: movaps %xmm10, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm14, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
+; SSE-NEXT: movdqa %xmm10, %xmm2
 ; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: pand %xmm14, %xmm1
+; SSE-NEXT: pand %xmm10, %xmm1
 ; SSE-NEXT: por %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: pand %xmm8, %xmm0
+-; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
+; SSE-NEXT: pand %xmm3, %xmm0
 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
@@ -2824,12 +2782,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: packuswb %xmm0, %xmm0
 ; SSE-NEXT: movdqa %xmm12, %xmm1
 ; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: por %xmm6, %xmm5
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,7]
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15]
+; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
@@ -2838,8 +2796,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: packuswb %xmm2, %xmm0
 ; SSE-NEXT: pand %xmm12, %xmm0
 ; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movaps %xmm5, %xmm1
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
@@ -2847,16 +2804,18 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
 ; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm14, %xmm15
-; SSE-NEXT: pandn %xmm1, %xmm15
-; SSE-NEXT: pand %xmm14, %xmm0
-; SSE-NEXT: por %xmm0, %xmm15
+; SSE-NEXT: movdqa %xmm10, %xmm9
+; SSE-NEXT: movdqa %xmm10, %xmm14
+; SSE-NEXT: pandn %xmm1, %xmm14
+; SSE-NEXT: pand %xmm10, %xmm0
+; SSE-NEXT: por %xmm0, %xmm14
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm0
 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
@@ -2867,102 +2826,140 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pandn %xmm0, %xmm1
 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
 ; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
+; SSE-NEXT: pxor %xmm10, %xmm10
 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,6]
-; SSE-NEXT: packuswb %xmm2, %xmm3
-; SSE-NEXT: pand %xmm12, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: movaps %xmm11, %xmm0
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5]
-; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm14, %xmm6
-; SSE-NEXT: pandn %xmm1, %xmm6
-; SSE-NEXT: pand %xmm14, %xmm3
-; SSE-NEXT: por %xmm3, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm8, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm9, %xmm11
+; SSE-NEXT: pandn %xmm1, %xmm11
+; SSE-NEXT: pand %xmm9, %xmm0
+; SSE-NEXT: por %xmm0, %xmm11
+; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
+; SSE-NEXT: packuswb %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm9, %xmm10
+; SSE-NEXT: pandn %xmm2, %xmm10
+; SSE-NEXT: pand %xmm9, %xmm0
+; SSE-NEXT: por %xmm0, %xmm10
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pand %xmm3, %xmm0
 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0]
-; SSE-NEXT: movaps %xmm2, %xmm3
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0]
+; SSE-NEXT: movaps %xmm2, %xmm4
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE-NEXT: pxor %xmm13, %xmm13
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0]
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
-; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: pand %xmm12, %xmm1
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,6,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pand %xmm12, %xmm0
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,5]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7]
 ; SSE-NEXT: packuswb %xmm2, %xmm2
 ; SSE-NEXT: pandn %xmm2, %xmm12
-; SSE-NEXT: por %xmm12, %xmm1
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: movaps %xmm12, %xmm2
-; SSE-NEXT: movdqa %xmm7, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
-; SSE-NEXT: pand %xmm14, %xmm1
+; SSE-NEXT: por %xmm12, %xmm0
+; SSE-NEXT: movdqa %xmm8, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
+; SSE-NEXT: pand %xmm9, %xmm0
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 =
xmm2[0,1,2,3,4,4,7,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[1,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm12, %xmm8 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm6, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15] -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm8 ; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,0,3,4,5,6,7] ; 
SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] @@ -2970,142 +2967,143 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: packuswb %xmm1, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: packuswb %xmm0, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,1] -; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[1,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm13[8],xmm10[9],xmm13[9],xmm10[10],xmm13[10],xmm10[11],xmm13[11],xmm10[12],xmm13[12],xmm10[13],xmm13[13],xmm10[14],xmm13[14],xmm10[15],xmm13[15] -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: packuswb %xmm1, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[1,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: packuswb %xmm1, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,1,1,3] -; SSE-NEXT: 
shufps {{.*#+}} xmm10 = xmm10[0,3],xmm1[1,2] -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm10[3,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm15 +; SSE-NEXT: por %xmm8, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = 
xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: packuswb %xmm0, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,1] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[1,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm13 +; SSE-NEXT: por %xmm5, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: packuswb %xmm0, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm0[1,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,3,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: packuswb %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: packuswb %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -3122,441 +3120,435 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm14, 16(%r8) -; SSE-NEXT: 
movdqa %xmm6, 48(%r8) -; SSE-NEXT: movdqa %xmm15, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps %xmm1, 16(%r9) -; SSE-NEXT: movaps %xmm4, 48(%r9) -; SSE-NEXT: movaps %xmm7, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movdqa %xmm9, 16(%r8) +; SSE-NEXT: movdqa %xmm10, 48(%r8) +; SSE-NEXT: movdqa %xmm11, (%r8) +; SSE-NEXT: movdqa %xmm14, 32(%r8) +; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps %xmm5, 48(%r9) +; SSE-NEXT: movaps %xmm8, (%r9) ; SSE-NEXT: movaps %xmm2, 32(%r9) -; SSE-NEXT: addq $568, %rsp # imm = 0x238 +; SSE-NEXT: addq $552, %rsp # imm = 0x228 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = 
<255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, 
%xmm15, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm4, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = 
<0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm8, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 +; 
AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[2,7,12] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [9,14,0,128,128,128,128,4,9,14,0,128,128,128,128,4] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,0,5,10,15,128,128,128,0,0,5,10,15,128] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,11,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm8, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm9[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[2,7,12] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm12, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm15, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,0,5,10,15],zero,zero,zero,xmm9[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = 
<128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,0,1,6,11,128,128,128,128,0,1,6,11,128,128] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u],zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm12[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [10,15,0,128,128,128,0,5,10,15,0,128,128,128,0,5] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] ; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm8, %ymm13 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u],zero,zero,zero,xmm4[0,5,10,15,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm2[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm4[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,2,7,12,0,0,128,128,128,2,7,12,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u],zero,zero,zero,xmm8[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[3,8,13],zero,zero,zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; 
AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[1,6,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,2,7,12],zero,zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,8,13],zero,zero,zero,zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128] ; 
AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u],zero,zero,zero,xmm8[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[4,9,14],zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,1,6,11,0,0,128,128,128,1,6,11,0,0,128,128] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm6, 
%xmm15, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm4, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[4,9,14],zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, 
%xmm3, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -3564,246 +3556,247 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i8_stride5_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $136, %rsp -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = 
<255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm12 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm10 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm15, %ymm4, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm6 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm12, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm12 -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm9, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm1 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm15, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 
= [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm9 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm15, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm5 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm6 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm13, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm5, %ymm8, %ymm13 -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm7 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm15, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} 
xmm6 = <2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm14, %ymm0 +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm14 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm14, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm6 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm4, %ymm15, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm7 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm11, %ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm7 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = 
<128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm3, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm5, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm4, %ymm15, %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm15 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm2 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; 
AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm12 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7],ymm9[8,9,10,11,12],ymm4[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm12 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm7 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm7 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; AVX2-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7],ymm13[8,9,10,11,12],ymm7[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm9 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm12 -; AVX2-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, 
%ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX2-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm8 +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm12 +; AVX2-ONLY-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm10 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm11 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm14 +; AVX2-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm15 +; AVX2-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX2-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm13, %ymm13 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} 
ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm15 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm3 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm3 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3815,12 +3808,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r8) +; 
AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 70548501cfe76..6d38dcbbcf074 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -97,73 +97,73 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = 
xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: andps %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: andps %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm5[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movd %xmm1, (%rsi) -; SSE-NEXT: movd 
%xmm5, (%rdx) -; SSE-NEXT: movd %xmm6, (%rcx) -; SSE-NEXT: movd %xmm7, (%r8) -; SSE-NEXT: movd %xmm0, (%r9) +; SSE-NEXT: movd %xmm0, (%rsi) +; SSE-NEXT: movd %xmm4, (%rdx) +; SSE-NEXT: movd %xmm7, (%rcx) +; SSE-NEXT: movd %xmm6, (%r8) +; SSE-NEXT: movd %xmm1, (%r9) ; SSE-NEXT: movd %xmm2, (%rax) ; SSE-NEXT: retq ; @@ -470,81 +470,78 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm14 +; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm9 ; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm9, %xmm9 ; SSE-NEXT: movdqa %xmm6, %xmm0 @@ -571,10 +568,10 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -587,10 +584,10 @@ define void 
@load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm15, %xmm0 @@ -602,7 +599,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] @@ -611,13 +608,14 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm13 ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] @@ -626,25 +624,25 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] ; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] ; 
SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] @@ -657,114 +655,113 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm6, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: por 
%xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pand %xmm12, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,3] -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: por %xmm11, %xmm13 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3] +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movdqa %xmm11, (%rdx) -; SSE-NEXT: movdqa %xmm8, (%rcx) -; SSE-NEXT: movdqa %xmm4, (%r8) -; SSE-NEXT: movdqa %xmm7, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} 
xmm5 = xmm5[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movdqa %xmm12, (%rdx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm10, (%r8) +; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, (%rax) ; SSE-NEXT: retq @@ -1080,450 +1077,457 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movdqa 64(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm14 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pand %xmm5, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm1, 
%xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa 176(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm12 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: packuswb %xmm15, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm5, 
%xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: psrld $16, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; SSE-NEXT: packuswb %xmm14, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = 
[65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: packuswb %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: packuswb %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] +; SSE-NEXT: 
pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm13[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: packuswb %xmm3, 
%xmm3 -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm10, %xmm0 @@ -1531,191 +1535,196 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 ; 
SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: packuswb %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[2,1,0,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = 
xmm3[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,3] -; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] +; SSE-NEXT: psrlq $48, %xmm6 ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = 
xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 ; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: pand %xmm5, %xmm10 ; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: packuswb %xmm11, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: packuswb %xmm12, %xmm11 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm7 ; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3] -; SSE-NEXT: psrlq $48, %xmm9 -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] -; SSE-NEXT: 
packuswb %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm10 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] +; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: packuswb %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por 
%xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: packuswb %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movdqa %xmm14, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movdqa %xmm12, (%r8) -; SSE-NEXT: movdqa %xmm4, 16(%r9) -; SSE-NEXT: movdqa %xmm6, (%r9) +; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movdqa %xmm3, 16(%r9) +; SSE-NEXT: movdqa %xmm8, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm8, (%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) ; SSE-NEXT: addq $280, %rsp # imm = 0x118 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $168, %rsp -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[4,10] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 @@ -1724,15 +1733,16 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 
= xmm5[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,11] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 @@ -1743,17 +1753,17 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm3, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm10 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm8[5,11,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[3,9,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm10, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm10, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] @@ -1776,8 +1786,8 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[0,6,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] 
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 @@ -1785,89 +1795,89 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm8[1,7,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] ; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3,4,5],xmm6[6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm5, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm12[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[4,10,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero +; AVX1-ONLY-NEXT: vandnps 
%ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm5[4,10,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm3[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5,11,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 @@ -1877,71 +1887,71 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload -; 
AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm5[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: addq $168, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1949,30 +1959,30 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i8_stride6_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), 
%ymm3 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm9 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm11 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm3[0,1] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm3 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] @@ -2007,12 +2017,12 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero ; AVX2-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7] @@ -2022,34 +2032,34 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm3 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u],zero,zero,xmm2[0,6,12],zero,zero,zero,xmm2[4,10] +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,xmm2[1,7,13],zero,zero,zero,xmm2[5,11] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2289,248 +2299,253 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $824, %rsp # imm = 0x338 -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: subq $808, %rsp # imm = 0x328 +; SSE-NEXT: movdqa 64(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 336(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa 336(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 288(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa 288(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 368(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 240(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: 
pandn %xmm12, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm6, %xmm12
-; SSE-NEXT: por %xmm0, %xmm12
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: movdqa 240(%rdi), %xmm14
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: pandn %xmm14, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: pandn %xmm14, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm6, %xmm14
+; SSE-NEXT: por %xmm0, %xmm14
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
 ; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm2
+; SSE-NEXT: movdqa %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm8, %xmm2
 ; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: movdqa 208(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm0
-; SSE-NEXT: movdqa 192(%rdi), %xmm9
-; SSE-NEXT: movdqa %xmm9, %xmm4
-; SSE-NEXT: pand %xmm5, %xmm4
+; SSE-NEXT: movdqa 208(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pandn %xmm4, %xmm0
+; SSE-NEXT: movdqa 192(%rdi), %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pand %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm8
 ; SSE-NEXT: por %xmm0, %xmm4
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm5, %xmm1
 ; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movdqa 272(%rdi), %xmm14
+; SSE-NEXT: movdqa 272(%rdi), %xmm15
 ; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: pandn %xmm14, %xmm2
-; SSE-NEXT: movdqa 256(%rdi), %xmm13
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm6, %xmm13
-; SSE-NEXT: por %xmm2, %xmm13
-; SSE-NEXT: movdqa %xmm13, %xmm2
-; SSE-NEXT: pand %xmm15, %xmm2
+; SSE-NEXT: pandn %xmm15, %xmm2
+; SSE-NEXT: movdqa 256(%rdi), %xmm12
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm6, %xmm12
+; SSE-NEXT: por %xmm2, %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm2
+; SSE-NEXT: pand %xmm11, %xmm2
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
 ; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm4
+; SSE-NEXT: movdqa %xmm9, %xmm4
 ; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: pand %xmm9, %xmm0
 ; SSE-NEXT: por %xmm0, %xmm4
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 128(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm6, %xmm0
 ; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: movdqa 144(%rdi), %xmm7
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: pandn %xmm7, %xmm4
+; SSE-NEXT: movdqa 144(%rdi), %xmm10
+; SSE-NEXT: movdqa %xmm8, %xmm4
+; SSE-NEXT: pandn %xmm10, %xmm4
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm6, %xmm4
-; SSE-NEXT: pandn %xmm7, %xmm4
+; SSE-NEXT: pandn %xmm10, %xmm4
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm6, %xmm7
-; SSE-NEXT: por %xmm0, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: pand %xmm6, %xmm10
+; SSE-NEXT: por %xmm0, %xmm10
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
 ; SSE-NEXT: packuswb %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm11
 ; SSE-NEXT: pandn %xmm0, %xmm11
 ; SSE-NEXT: movdqa %xmm6, %xmm0
 ; SSE-NEXT: movdqa %xmm6, %xmm4
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: pandn %xmm3, %xmm4
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: pandn %xmm1, %xmm4
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm7, %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pandn %xmm9, %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm5
+; SSE-NEXT: movdqa %xmm7, %xmm4
+; SSE-NEXT: movdqa %xmm6, %xmm2
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm3, %xmm5
 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
-; SSE-NEXT: movdqa %xmm10, %xmm9
-; SSE-NEXT: pandn %xmm6, %xmm10
+; SSE-NEXT: movdqa %xmm8, %xmm7
+; SSE-NEXT: movdqa %xmm8, %xmm9
+; SSE-NEXT: pandn %xmm6, %xmm9
 ; SSE-NEXT: movdqa 160(%rdi), %xmm8
 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm0, %xmm8
-; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm3
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pandn %xmm2, %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm7, %xmm1
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; SSE-NEXT: pandn %xmm3, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2542,7 +2557,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm0, %xmm4
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm7, %xmm3
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; SSE-NEXT: pandn %xmm4, %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2555,21 +2570,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm0, %xmm5
 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: pandn %xmm14, %xmm3
+; SSE-NEXT: movdqa %xmm7, %xmm3
+; SSE-NEXT: pandn %xmm15, %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm14
-; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm0, %xmm15
+; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm0, %xmm3
 ; SSE-NEXT: pandn %xmm6, %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm6, %xmm15
 ; SSE-NEXT: movdqa 96(%rdi), %xmm5
 ; SSE-NEXT: movdqa %xmm5, %xmm3
 ; SSE-NEXT: pand %xmm0, %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm14
-; SSE-NEXT: movdqa %xmm14, %xmm3
-; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm3
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm0, %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
@@ -2584,52 +2600,54 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm0, %xmm4
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm0, %xmm15
+; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pandn %xmm5, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: pand %xmm9, %xmm2
-; SSE-NEXT: por %xmm10, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pand %xmm15, %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm3
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm7, %xmm5
 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535]
-; SSE-NEXT: pand %xmm4, %xmm5
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535]
+; SSE-NEXT: pand %xmm9, %xmm5
 ; SSE-NEXT: por %xmm11, %xmm5
-; SSE-NEXT: pandn %xmm14, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm0
 ; SSE-NEXT: por %xmm0, %xmm8
 ; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm11
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; SSE-NEXT: movdqa %xmm7, %xmm1
 ; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: pand %xmm7, %xmm5
 ; SSE-NEXT: por %xmm5, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pxor %xmm5, %xmm5
@@ -2637,196 +2655,195 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm1, %xmm0
 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
 ; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; SSE-NEXT: packuswb %xmm14, %xmm6
-; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: pandn %xmm6, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; SSE-NEXT: packuswb %xmm15, %xmm4
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: pandn %xmm4, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535]
+; SSE-NEXT: movdqa %xmm0, %xmm15
+; SSE-NEXT: pandn %xmm4, %xmm15
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7]
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: por %xmm15, %xmm4
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: pand %xmm9, %xmm4
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pandn %xmm15, %xmm1
 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,0,0,65535,65535]
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: pandn %xmm6, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7]
-; SSE-NEXT: pand %xmm14, %xmm6
-; SSE-NEXT: por %xmm0, %xmm6
-; SSE-NEXT: packuswb %xmm6, %xmm6
-; SSE-NEXT: pand %xmm4, %xmm6
-; SSE-NEXT: movdqa %xmm4, %xmm10
-; SSE-NEXT: por %xmm1, %xmm6
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,5,5,5,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,0,65535]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,4]
+; SSE-NEXT: pand %xmm2, %xmm15
+; SSE-NEXT: por %xmm1, %xmm15
+; SSE-NEXT: packuswb %xmm15, %xmm1
+; SSE-NEXT: movdqa %xmm7, %xmm15
+; SSE-NEXT: pandn %xmm1, %xmm15
+; SSE-NEXT: pand %xmm7, %xmm4
+; SSE-NEXT: por %xmm4, %xmm15
+; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm13, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
+; SSE-NEXT: packuswb %xmm13, %xmm4
+; SSE-NEXT: movdqa %xmm9, %xmm15
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: pandn %xmm4, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm6, %xmm4
 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4]
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: por %xmm0, %xmm3
-; SSE-NEXT: packuswb %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: pandn %xmm0, %xmm3
-; SSE-NEXT: pand %xmm9, %xmm6
-; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm13
+; SSE-NEXT: pandn %xmm4, %xmm13
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7]
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: por %xmm13, %xmm4
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: pand %xmm9, %xmm4
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: movdqa %xmm2, %xmm13
+; SSE-NEXT: pandn %xmm1, %xmm13
+; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: por %xmm13, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm7, %xmm13
+; SSE-NEXT: pandn %xmm1, %xmm13
+; SSE-NEXT: pand %xmm7, %xmm4
+; SSE-NEXT: por %xmm4, %xmm13
+; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3]
+; SSE-NEXT: packuswb %xmm9, %xmm4
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: pandn %xmm4, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm6, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pandn %xmm4, %xmm9
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7]
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: por %xmm9, %xmm4
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: pand %xmm15, %xmm4
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pandn %xmm1, %xmm9
+; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: por %xmm9, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm7, %xmm9
+; SSE-NEXT: pandn %xmm1, %xmm9
+; SSE-NEXT: pand %xmm7, %xmm4
+; SSE-NEXT: por %xmm4, %xmm9
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,1,0,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE-NEXT: packuswb %xmm6, %xmm3
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; SSE-NEXT: movdqa %xmm14, %xmm6
-; SSE-NEXT: pandn %xmm3, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7]
-; SSE-NEXT: pand %xmm14, %xmm3
-; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: pand %xmm10, %xmm3
-; SSE-NEXT: por %xmm0, %xmm3
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por %xmm6, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: por %xmm3, %xmm0
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm9, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: pand %xmm9, %xmm3
-; SSE-NEXT: por %xmm3, %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE-NEXT: packuswb %xmm6, %xmm3
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; SSE-NEXT: movdqa %xmm14, %xmm6
-; SSE-NEXT: pandn %xmm3, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7]
-; SSE-NEXT: pand %xmm14, %xmm3
-; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: pand %xmm10, %xmm3
-; SSE-NEXT: por %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por %xmm6, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm9, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: pand %xmm9, %xmm3
-; SSE-NEXT: por %xmm3, %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE-NEXT: packuswb %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7]
-; SSE-NEXT: pand %xmm14, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm14
-; SSE-NEXT: por %xmm2, %xmm14
-; SSE-NEXT: packuswb %xmm14, %xmm14
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: pand %xmm10, %xmm14
-; SSE-NEXT: pandn %xmm3, %xmm0
-; SSE-NEXT: por %xmm0, %xmm14
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: pandn %xmm4, %xmm1
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,4]
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm14
-; SSE-NEXT: por %xmm14, %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4]
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: packuswb %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm7, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0]
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: pand %xmm14, %xmm0
+; SSE-NEXT: pand %xmm12, %xmm0
 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
@@ -2837,42 +2854,43 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
 ; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pand %xmm15, %xmm1
+; SSE-NEXT: pand %xmm11, %xmm1
+; SSE-NEXT: movdqa %xmm11, %xmm4
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por %xmm0, %xmm2
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; SSE-NEXT: por %xmm0, %xmm14
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm9, %xmm2
-; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm7, %xmm1
 ; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm3
-; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: por %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: pand %xmm14, %xmm0
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: pand %xmm12, %xmm1
+; SSE-NEXT: movdqa %xmm12, %xmm10
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm12
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
@@ -2880,42 +2898,42 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
 ; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: pand %xmm15, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: pand %xmm11, %xmm1
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por %xmm0, %xmm2
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: por %xmm0, %xmm11
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: por %xmm0, %xmm9
+; SSE-NEXT: movdqa %xmm9, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm7, %xmm1
 ; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: por %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: pand %xmm14, %xmm0
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: pand %xmm10, %xmm8
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
@@ -2923,42 +2941,43 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
 ; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm7, %xmm1
-; SSE-NEXT: pand %xmm15, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm13, %xmm1
+; SSE-NEXT: pand %xmm11, %xmm1
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por %xmm0, %xmm2
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE-NEXT: por %xmm0, %xmm13
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm7, %xmm1
 ; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: por %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: pand %xmm14, %xmm0
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; SSE-NEXT: pand %xmm10, %xmm7
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: pand %xmm11, %xmm0
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
@@ -2966,587 +2985,587 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
 ; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm12, %xmm1
-; SSE-NEXT: pand %xmm15, %xmm1
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm1
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
 ; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm8, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: por %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por %xmm0, %xmm2
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; SSE-NEXT: por %xmm0, %xmm14
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: pand %xmm15, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: por %xmm0, %xmm10
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm0
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; SSE-NEXT: movdqa %xmm4, %xmm1
 ; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: por %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm10, %xmm10
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
 ; SSE-NEXT: movdqa %xmm4, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0]
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
 ; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm2, %xmm2
 ; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: por %xmm1, %xmm6
-; SSE-NEXT: movdqa %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: pandn %xmm0, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
-; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
-; SSE-NEXT: movaps %xmm0, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: por %xmm6, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: por %xmm1, %xmm6
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
-; SSE-NEXT: movdqa %xmm5, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
-; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm6
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
-; SSE-NEXT: movdqa %xmm9, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSE-NEXT: movaps %xmm0, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm14, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,5]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: packuswb %xmm2, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; SSE-NEXT: movdqa %xmm14, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: pand %xmm14, %xmm4
+; SSE-NEXT: por %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm12, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15]
+; SSE-NEXT: movdqa %xmm12, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
+; SSE-NEXT: movaps %xmm2, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm12[2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm15, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: packuswb %xmm2, %xmm2
 ; SSE-NEXT: movdqa %xmm3, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: por %xmm4, %xmm6
+; SSE-NEXT: movdqa %xmm9, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm14, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: pand %xmm14, %xmm6
+; SSE-NEXT: por %xmm6, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
+; SSE-NEXT: movdqa %xmm8, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
+; SSE-NEXT: movaps %xmm2, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm8[2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm13, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: packuswb %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: por %xmm4, %xmm6
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm8, %xmm2
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm14, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: pand %xmm14, %xmm6
+; SSE-NEXT: por %xmm6, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm7, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
+; SSE-NEXT: movdqa %xmm7, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
+; SSE-NEXT: movaps %xmm2, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm7[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm11, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
+; SSE-NEXT: pand %xmm0, %xmm6
+; SSE-NEXT: pandn %xmm2, %xmm0
 ; SSE-NEXT: por %xmm6, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm4
 ; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: pandn %xmm0, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: por %xmm1, %xmm6
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
-; SSE-NEXT: movdqa %xmm5, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; SSE-NEXT: pand %xmm1, %xmm2
 ; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
-; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm6
-; SSE-NEXT: por %xmm6, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[3,0]
-; SSE-NEXT: movaps %xmm1, %xmm7
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm1, %xmm6
-; SSE-NEXT: movdqa %xmm12, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pand %xmm3, %xmm7
-; SSE-NEXT: pandn %xmm1, %xmm3
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm8
-; SSE-NEXT: por %xmm6, %xmm8
-; SSE-NEXT: movdqa %xmm14, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pandn %xmm1, %xmm5
-; SSE-NEXT: por %xmm3, %xmm5
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm8
-; SSE-NEXT: packuswb %xmm5, %xmm1
-; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: por %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm14, %xmm2
+; SSE-NEXT: pand %xmm14, %xmm3
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: pandn %xmm0, %xmm2
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: pand %xmm15, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
-; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm14, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm12, %xmm1
-; SSE-NEXT: pand %xmm15, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pand %xmm14, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: pand %xmm0, %xmm13
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand
%xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: packuswb %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: 
pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; 
SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,2] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm14, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[2,3] -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por 
%xmm2, %xmm10 +; SSE-NEXT: packuswb %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm8, %xmm6 ; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] ; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: packuswb %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm10, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; 
SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: packuswb %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; 
SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 ; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm14, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: packuswb %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3] +; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm9 = 
xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm9, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm13 -; SSE-NEXT: por %xmm13, %xmm15 -; SSE-NEXT: packuswb %xmm15, %xmm14 -; SSE-NEXT: movdqa %xmm2, %xmm13 -; 
SSE-NEXT: pandn %xmm14, %xmm13 -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm14, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm9 -; SSE-NEXT: por %xmm15, %xmm9 -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] ; SSE-NEXT: 
pandn %xmm5, %xmm11 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: packuswb %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3580,66 +3599,67 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movdqa %xmm7, 16(%r9) -; SSE-NEXT: movdqa %xmm6, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 16(%rax) -; SSE-NEXT: movdqa %xmm13, 32(%rax) +; SSE-NEXT: movdqa %xmm4, 16(%rax) +; SSE-NEXT: movdqa %xmm6, 32(%rax) ; SSE-NEXT: movdqa %xmm12, 48(%rax) -; SSE-NEXT: movdqa %xmm8, (%rax) -; SSE-NEXT: addq $824, %rsp # imm = 0x338 +; SSE-NEXT: movdqa %xmm2, (%rax) +; SSE-NEXT: addq $808, %rsp # imm = 0x328 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: 
vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -3670,529 +3690,547 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm2, 
%xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, 
%xmm6 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup 
{{.*#+}} xmm2 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] ; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] 
+; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm0[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[0,6,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3,4,5],xmm14[6,7] -; 
AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm3, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[5,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte 
Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[2,8,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; 
AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm1[5,11,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[5,11,u,u,u,u,u,u,u,u,u,u,u] 
+; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} 
xmm4 = xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm14[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[0,6,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[0,6,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] -; AVX1-ONLY-NEXT: 
vpunpckhqdq {{.*#+}} xmm4 = xmm14[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[5,11],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm1[1,7,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[1,7,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4214,259 +4252,256 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride6_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm12, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm5, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb 
%ymm1, %ymm3, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm13 -; AVX2-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm9 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 +; AVX2-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm15, %ymm9, %ymm14 -; AVX2-ONLY-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm10 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[0,1],ymm10[0,1] -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm11, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 
{{.*#+}} xmm2 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm13, %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm5 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm11, %ymm12, %ymm2 -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm2, %ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm3, %ymm6, %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm9, %ymm8, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm4, %ymm7 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm5, %ymm7, %ymm14 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: 
vpbroadcastq {{.*#+}} ymm1 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm3, %ymm15 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm7 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm3 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm15 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm6 +; 
AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm12, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm11, %ymm2, %ymm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm4 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm10 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-ONLY-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm6 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm9 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm14, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm1 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm11 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6 -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm14 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm1 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm13, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX2-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm3, %ymm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm8[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; 
AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm11 +; AVX2-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm13, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm12 -; AVX2-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm13, %ymm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm11[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm4 -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm4 -; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm6, %ymm5 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm11 +; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] +; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4],xmm13[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm2 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm2 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm8 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm11 -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7],ymm10[8,9,10],ymm4[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw 
{{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm8 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 @@ -4474,288 +4509,280 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rsi) ; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i8_stride6_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $88, %rsp -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm29 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm10 -; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-NEXT: vpshufb %xmm7, %xmm11, %xmm3 -; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512F-NEXT: vmovdqa64 32(%rdi), %ymm30 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm27 -; AVX512F-NEXT: vmovdqa64 160(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm12 -; AVX512F-NEXT: vpternlogq $202, %ymm27, %ymm20, 
%ymm12 -; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-NEXT: subq $40, %rsp +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm25 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm0 +; AVX512F-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX512F-NEXT: vpor %xmm3, %xmm6, %xmm9 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm30 +; AVX512F-NEXT: vmovdqa64 32(%rdi), %ymm31 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm29 +; AVX512F-NEXT: vmovdqa64 160(%rdi), %ymm18 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm6 +; AVX512F-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm6 +; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm12, %xmm9 -; AVX512F-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm28, %ymm14 -; AVX512F-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-NEXT: vpshufb %xmm7, %xmm15, %xmm1 -; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm5 +; AVX512F-NEXT: vpshufb %xmm3, %xmm7, %xmm12 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-NEXT: vpshufb %xmm8, %xmm6, %xmm13 +; AVX512F-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa %ymm11, %ymm9 +; AVX512F-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512F-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX512F-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX512F-NEXT: vporq %xmm1, %xmm5, %xmm17 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm24 -; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm17 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX512F-NEXT: vpor %xmm4, %xmm11, %xmm2 -; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm13, %xmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX512F-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512F-NEXT: vpor %xmm11, %xmm12, %xmm2 -; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-NEXT: 
vpshufb %xmm10, %xmm15, %xmm10 -; AVX512F-NEXT: vpor %xmm3, %xmm10, %xmm2 -; AVX512F-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX512F-NEXT: vporq %xmm1, %xmm0, %xmm26 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm13, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm10 -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512F-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm11 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-NEXT: vpshufb %xmm7, %xmm4, %xmm12 -; AVX512F-NEXT: vpor %xmm11, %xmm12, %xmm2 -; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa %ymm13, %ymm11 -; AVX512F-NEXT: vpternlogq $202, %ymm28, %ymm30, %ymm11 -; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512F-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm12 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3 -; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512F-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm1 -; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm1, %xmm15, %xmm8 -; AVX512F-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512F-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512F-NEXT: vpor %xmm8, %xmm10, %xmm1 -; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm22, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm8 +; AVX512F-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm21 = ymm0[2,3],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $1, 288(%rdi), %ymm0, %ymm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = 
[65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm15 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm15[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm10 = ymm17[2,3],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $1, 96(%rdi), %ymm17, %ymm25 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512F-NEXT: vpternlogq $202, %ymm10, %ymm25, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $248, %ymm2, %ymm5, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm17 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm15 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm6, %zmm15 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512F-NEXT: vpternlogq $184, %zmm15, %zmm5, %zmm26 -; AVX512F-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-NEXT: vpshufb %xmm0, %xmm7, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vporq %xmm4, %xmm6, %xmm28 +; AVX512F-NEXT: vpshufb %xmm8, %xmm9, %xmm4 +; AVX512F-NEXT: vpshufb %xmm12, %xmm13, %xmm6 +; AVX512F-NEXT: vporq %xmm4, %xmm6, %xmm21 +; AVX512F-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm27 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; 
AVX512F-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm15 +; AVX512F-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa %ymm11, %ymm5 +; AVX512F-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-NEXT: vpshufb %xmm12, %xmm1, %xmm13 +; AVX512F-NEXT: vpor %xmm8, %xmm13, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512F-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512F-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512F-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm14 -; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX512F-NEXT: vpshufb %xmm7, %xmm9, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512F-NEXT: vpternlogq $226, %ymm31, %ymm16, %ymm4 -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm11 -; AVX512F-NEXT: vpshufb %xmm8, %xmm11, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[4,10],zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[u,u,u,u,u,u] -; AVX512F-NEXT: vporq %xmm0, %xmm2, %xmm29 -; AVX512F-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-NEXT: vpternlogq $226, %ymm27, %ymm13, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512F-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX512F-NEXT: vporq %xmm3, %xmm7, %xmm20 -; AVX512F-NEXT: vpternlogq $202, %ymm28, %ymm30, %ymm16 -; AVX512F-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm25, %ymm10, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm13 -; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm7 -; AVX512F-NEXT: vpshufb %xmm5, %xmm7, %xmm5 ; AVX512F-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX512F-NEXT: vporq %xmm5, %xmm6, %xmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm11 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm11, %xmm4, %xmm9 -; AVX512F-NEXT: vporq %xmm6, %xmm9, %xmm27 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX512F-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX512F-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm12 -; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpternlogq $236, %ymm18, %ymm3, %ymm14 -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm1 -; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm3, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-NEXT: vextracti32x4 $1, %ymm16, %xmm3 -; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm4 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[4,10],zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm4, %xmm8, %xmm4 -; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX512F-NEXT: vpshufb %xmm15, %xmm13, %xmm8 -; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vpternlogq $226, %ymm25, %ymm8, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm8 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $242, %ymm5, %ymm13, %ymm9 -; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5 -; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm9 -; AVX512F-NEXT: vinserti32x4 $2, %xmm29, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $226, %zmm9, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm11, %xmm6, %xmm4 -; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm7 = ymm8[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $242, %ymm4, %ymm13, %ymm7 -; 
AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm4 +; AVX512F-NEXT: vporq %xmm0, %xmm6, %xmm16 +; AVX512F-NEXT: vmovdqa %ymm11, %ymm10 +; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm24, %ymm10 +; AVX512F-NEXT: vpshufb %xmm7, %xmm10, %xmm8 +; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX512F-NEXT: vpshufb %xmm12, %xmm7, %xmm12 +; AVX512F-NEXT: vpor %xmm8, %xmm12, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm12, %xmm15, %xmm15 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512F-NEXT: vpor %xmm4, %xmm15, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] +; AVX512F-NEXT: vpor %xmm1, %xmm15, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 +; AVX512F-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512F-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 +; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload +; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512F-NEXT: vpshufb %xmm12, %xmm14, %xmm0 +; AVX512F-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm21 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,u,u,u,1,7,13],zero,zero,zero,xmm10[5,11],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] +; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm28 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512F-NEXT: vpternlogq $226, %ymm26, %ymm11, %ymm7 +; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm14, %xmm7, %xmm2 +; AVX512F-NEXT: vporq %xmm1, %xmm2, %xmm26 +; AVX512F-NEXT: vmovdqa64 %ymm18, %ymm10 +; AVX512F-NEXT: vpternlogq $226, %ymm29, %ymm9, %ymm10 +; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512F-NEXT: vporq %xmm2, %xmm4, %xmm27 +; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm11 +; AVX512F-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm22, %ymm9 +; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512F-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512F-NEXT: vporq %xmm1, %xmm2, %xmm24 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa64 %xmm1, %xmm29 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm13, %xmm7, %xmm1 +; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm18 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-NEXT: vpshufb %xmm2, %xmm8, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX512F-NEXT: vpor %xmm1, %xmm8, %xmm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $236, %ymm22, %ymm3, %ymm16 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpternlogq $236, %ymm22, %ymm4, %ymm21 +; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; 
AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpternlogq $248, %ymm22, %ymm4, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 +; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512F-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX512F-NEXT: vpshufb %xmm14, %xmm11, %xmm14 +; AVX512F-NEXT: vpor %xmm5, %xmm14, %xmm5 +; AVX512F-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 +; AVX512F-NEXT: vpternlogq $248, %ymm22, %ymm9, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm9 +; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $242, %ymm9, %ymm12, %ymm10 +; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm9 +; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm10 +; AVX512F-NEXT: vinserti32x4 $2, %xmm26, %zmm10, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $226, %zmm10, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $184, %zmm5, %zmm10, %zmm9 +; AVX512F-NEXT: vmovdqa64 %xmm29, %xmm5 +; AVX512F-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm13, %xmm11, %xmm5 +; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm27, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm3 -; AVX512F-NEXT: vpternlogq $184, %zmm3, %zmm9, %zmm4 -; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm12 -; AVX512F-NEXT: vinserti128 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm2 -; AVX512F-NEXT: vpternlogq $184, %zmm14, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq $242, %ymm0, %ymm12, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm2 +; AVX512F-NEXT: vinserti32x4 $2, %xmm18, %zmm2, %zmm2 +; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm14, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm10, %zmm0 +; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload +; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 +; AVX512F-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm3, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512F-NEXT: addq $88, %rsp +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-NEXT: addq $40, %rsp ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 0cd7ba03c66cd..44459e1756009 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -20,50 +20,50 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: psrlq $48, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movd %xmm3, %edi +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movd %xmm1, %edi ; SSE-NEXT: movw %di, (%rsi) ; SSE-NEXT: movd %xmm4, %esi ; SSE-NEXT: movw %si, (%rdx) -; SSE-NEXT: movd %xmm6, %edx +; SSE-NEXT: movd %xmm5, %edx ; SSE-NEXT: movw %dx, (%rcx) -; SSE-NEXT: movd %xmm5, %ecx 
+; SSE-NEXT: movd %xmm6, %ecx ; SSE-NEXT: movw %cx, (%r8) ; SSE-NEXT: movd %xmm7, %ecx ; SSE-NEXT: movw %cx, (%r9) ; SSE-NEXT: movd %xmm0, %ecx ; SSE-NEXT: movw %cx, (%r10) -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movw %cx, (%rax) ; SSE-NEXT: retq ; @@ -108,67 +108,67 @@ define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pand %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = 
xmm0[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: por %xmm5, %xmm15 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[1,1,0,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,0,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: por %xmm14, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpckhbw 
{{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] @@ -176,39 +176,39 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 ; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,0,2,3] ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,0,2,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3],xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,2,1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = 
xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movd %xmm2, (%rsi) -; SSE-NEXT: movd %xmm3, (%rdx) -; SSE-NEXT: movd %xmm6, (%rcx) -; SSE-NEXT: movd %xmm4, (%r8) -; SSE-NEXT: movd %xmm5, (%r9) +; SSE-NEXT: movd %xmm5, (%rdx) +; SSE-NEXT: movd %xmm3, (%rcx) +; SSE-NEXT: movd %xmm6, (%r8) +; SSE-NEXT: movd %xmm4, (%r9) ; SSE-NEXT: movd %xmm8, (%rdi) ; SSE-NEXT: movd %xmm0, (%rax) ; SSE-NEXT: retq @@ -389,102 +389,108 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm6 ; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pxor %xmm13, %xmm13 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = 
[0,65535,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm0[0],xmm9[1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm0, %xmm12 ; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[2,3] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3] +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6] +; SSE-NEXT: packuswb %xmm9, %xmm9 
+; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm9, %xmm13 +; SSE-NEXT: por %xmm7, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 @@ -495,15 +501,15 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 @@ -511,108 +517,114 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = 
xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm2[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: movq %xmm5, (%rsi) -; SSE-NEXT: movq %xmm8, (%rdx) -; SSE-NEXT: movq %xmm9, (%rcx) -; SSE-NEXT: movq %xmm11, (%r8) -; SSE-NEXT: movq %xmm0, (%r9) -; SSE-NEXT: movq %xmm1, (%rdi) -; SSE-NEXT: movq %xmm2, (%rax) -; SSE-NEXT: retq -; -; AVX1-ONLY-LABEL: load_i8_stride7_vf8: -; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: movq %xmm13, (%rsi) +; SSE-NEXT: movq %xmm9, (%rdx) +; SSE-NEXT: movq %xmm8, (%rcx) +; SSE-NEXT: movq %xmm6, (%r8) +; SSE-NEXT: movq %xmm10, (%r9) +; SSE-NEXT: movq %xmm11, (%rdi) +; SSE-NEXT: movq %xmm0, (%rax) +; SSE-NEXT: retq +; +; AVX1-ONLY-LABEL: load_i8_stride7_vf8: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2] @@ -858,522 +870,534 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, 
ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $184, %rsp -; SSE-NEXT: movdqa 96(%rdi), %xmm14 -; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 64(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm15, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,2,1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = 
xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm13, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pand %xmm14, %xmm7 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pslld $16, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,2,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm12, %xmm14 +; SSE-NEXT: por %xmm8, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pslld $16, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, (%rsp) # 16-byte Spill -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE-NEXT: packuswb %xmm14, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: 
por %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: packuswb %xmm8, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] ; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm5 -; 
SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm9, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm7[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: packuswb %xmm2, %xmm10 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 
16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = 
[65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: packuswb %xmm0, %xmm9 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: andps %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] +; SSE-NEXT: pand %xmm13, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,0,3] +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: por %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; SSE-NEXT: andps %xmm5, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] -; SSE-NEXT: packuswb %xmm7, %xmm5 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,1,0,3] +; SSE-NEXT: packuswb %xmm7, %xmm4 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por 
%xmm6, %xmm4 -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: por %xmm5, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm3, (%r8) -; SSE-NEXT: movdqa %xmm0, (%r9) +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: andps %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movdqa %xmm8, (%rcx) +; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movdqa %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm2, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm14, (%rax) -; SSE-NEXT: addq $184, %rsp +; SSE-NEXT: movdqa %xmm5, (%rax) +; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] @@ -1384,8 +1408,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] @@ -1398,13 +1422,13 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vpxor %xmm12, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[3,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[3,10] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[u,u,u,u,u,u,u] @@ -1414,11 +1438,11 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[4,11] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[4,11] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm10, %xmm13, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[u,u,u,u,u,u,u] @@ -1428,12 +1452,12 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[5,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] @@ -1443,10 +1467,10 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[6,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[6,13] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm0, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} 
xmm14 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u] @@ -1459,12 +1483,12 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[0,7,14] ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm14, %xmm8 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 @@ -1473,13 +1497,13 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[1,8,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[1,8,15] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] ; AVX1-ONLY-NEXT: vpblendw $31, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4],xmm1[5,6,7] @@ -1807,29 +1831,28 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $632, %rsp # imm = 0x278 -; SSE-NEXT: movdqa 208(%rdi), %xmm9 -; SSE-NEXT: movdqa 192(%rdi), %xmm6 -; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: subq $648, %rsp # imm = 0x288 +; SSE-NEXT: movdqa 208(%rdi), %xmm14 +; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm6 ; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; 
SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm14, %xmm14 +; SSE-NEXT: pxor %xmm10, %xmm10 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -1838,20 +1861,25 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] @@ -1862,55 +1890,55 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -1919,19 +1947,22 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] @@ -1941,125 +1972,130 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: movdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm12 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm7 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] 
-; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm3, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 @@ -2067,785 +2103,789 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pand %xmm15, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pand %xmm13, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: 
movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm10 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = 
xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm14, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pslld $16, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm10, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = 
xmm14[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pand %xmm15, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; SSE-NEXT: pand %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; SSE-NEXT: packuswb %xmm13, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = 
[255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm15, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,3] +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: 
punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: packuswb %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pxor %xmm15, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3],xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm12, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: packuswb %xmm1, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm12, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: packuswb %xmm8, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; 
SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: por %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[3,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; SSE-NEXT: pand %xmm8, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} 
xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,2,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm3 
-; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: pshuflw 
{{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: andps %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: packuswb %xmm0, %xmm5 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; 
SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: packuswb %xmm1, %xmm14 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm5[0],xmm6[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,0,3] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,2,1] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm11, %xmm4 -; SSE-NEXT: andps %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, 
%xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm6[0],xmm12[1,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: packuswb %xmm5, 
%xmm5 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm5[0],xmm8[1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: pandn %xmm7, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: por %xmm13, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm7, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm6, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm14, %xmm6 -; SSE-NEXT: andps %xmm10, %xmm12 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm13 = 
xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm12, %xmm5 +; SSE-NEXT: andps %xmm0, %xmm8 +; SSE-NEXT: por %xmm8, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm12[0],xmm11[1,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = 
xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm6 +; SSE-NEXT: andps %xmm3, %xmm8 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por 
%xmm10, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[3,3,3,3] +; SSE-NEXT: packuswb %xmm8, %xmm10 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm12, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm12, %xmm5 -; SSE-NEXT: andps %xmm9, %xmm11 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: por %xmm12, 
%xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: andps %xmm3, %xmm10 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm11, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; SSE-NEXT: pxor %xmm12, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] ; SSE-NEXT: packuswb %xmm11, %xmm10 -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm8[0],xmm10[1,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; 
SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm8, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm15[8],xmm12[9],xmm15[9],xmm12[10],xmm15[10],xmm12[11],xmm15[11],xmm12[12],xmm15[12],xmm12[13],xmm15[13],xmm12[14],xmm15[14],xmm12[15],xmm15[15] -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm12 -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm12, %xmm8 -; SSE-NEXT: andps %xmm9, %xmm10 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: pxor %xmm14, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[3,3,3,3] -; SSE-NEXT: packuswb %xmm12, %xmm11 -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm10[0],xmm11[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw 
{{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: pandn %xmm12, %xmm7 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: andps %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: andps %xmm1, %xmm10 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm10, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2854,329 +2894,326 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movdqa %xmm4, (%r9) -; SSE-NEXT: movdqa %xmm3, 16(%r9) +; SSE-NEXT: movdqa %xmm14, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm5, (%rax) -; SSE-NEXT: movdqa %xmm6, 16(%rax) +; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: movdqa %xmm5, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, (%rax) +; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: movdqa %xmm8, 16(%rax) -; SSE-NEXT: addq $632, %rsp # imm = 0x278 +; SSE-NEXT: addq $648, %rsp # imm = 0x288 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $200, %rsp -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm5[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm5[u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm6[u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,5,12],zero,zero,xmm8[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u],zero,zero,xmm8[3,10,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,5,12],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u,u,6,13],zero,zero,xmm8[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[4,11,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,6,13],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm5[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[0,7,14,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,5,12],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm11[3,10],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm10[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[1,8,15,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u],zero,zero,xmm9[4,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,6,13],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u],zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,6,13],zero,zero,xmm5[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, 
%xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,3,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u],zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[5,12,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm3, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm4, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = 
xmm2[10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; AVX1-ONLY-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps (%rsp), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 
= xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] -; AVX1-ONLY-NEXT: vpxor %xmm14, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[3,10] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[4,11] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; 
AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps (%rsp), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[5,12] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm1, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm7[6,13] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[2,9,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[0,7,14] -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} 
xmm11 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,xmm8[3,10,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[1,8,15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[6,13] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, 
%xmm10 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[0,7,14] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[1,8,15] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: addq $200, %rsp @@ -3185,643 +3222,637 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-SLOW-LABEL: load_i8_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $104, %rsp -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; 
AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3
-; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm14
+; AVX2-SLOW-NEXT: subq $72, %rsp
+; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10
+; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6
+; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13
+; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0>
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 =
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm14, %ymm2, %ymm7
-; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm13
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0
+; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1
+; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm7
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm8
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm14, %ymm9
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7,8,9],ymm10[10],ymm9[11,12,13],ymm10[14],ymm9[15]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
-; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm10
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm4, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8
+; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm0
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX2-SLOW-NEXT: vpor %xmm1, %xmm8, %xmm1
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm0
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 =
-; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm7
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm1, %xmm8, %xmm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
; AVX2-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm0
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 =
-; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm8
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm0, %xmm8, %xmm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
-; AVX2-SLOW-NEXT: vpor %xmm8, %xmm13, %xmm8
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm0
-; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm13
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm1, %xmm12, %xmm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
+; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
-; AVX2-SLOW-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0
+; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
+; AVX2-SLOW-NEXT: vpor %xmm11, %xmm15, %xmm11
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 =
-; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm3, %ymm5
-; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7
-; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm6
-; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm15
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm4
-; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm14, %ymm1
-; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm9
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm14, %ymm0, %ymm7
-; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm14, %ymm0, %ymm8
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm6, %xmm14, %xmm6
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8,9,10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm15
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4,5,6],ymm15[7,8],ymm1[9,10],ymm15[11],ymm1[12,13,14],ymm15[15]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm6
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4],ymm9[5,6],ymm6[7,8],ymm9[9,10,11],ymm6[12],ymm9[13,14],ymm6[15]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15
+; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0
+; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8
+; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7
+; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm13
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm3, %xmm13, %xmm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm1, %xmm6, %xmm1
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm6
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15]
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm7
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm5
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
-; AVX2-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm5 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm6
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
+; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r9)
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r8)
+; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9)
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax)
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax)
-; AVX2-SLOW-NEXT: addq $104, %rsp
+; AVX2-SLOW-NEXT: addq $72, %rsp
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: load_i8_stride7_vf32:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: pushq %rax
-; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm7
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4
-; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3
+; AVX2-FAST-NEXT: subq $40, %rsp
+; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7
+; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm8
+; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5
+; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0>
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0
; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 =
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm10
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 =
+; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm1
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm1
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm1
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm1, %xmm8, %xmm1
+; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,2,4,6,1,2,4,6]
-; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm8
-; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm11
-; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm8[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,6,1,2,4,6]
+; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10
+; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0
; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm3, %ymm1
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4,5],ymm9[6],ymm1[7,8,9],ymm9[10],ymm1[11,12,13],ymm9[14],ymm1[15]
+; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7,8,9],ymm7[10],ymm1[11,12,13],ymm7[14],ymm1[15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm7, %ymm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1
+; AVX2-FAST-NEXT: vpor %xmm7, %xmm1, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,3,4,6,1,3,4,6]
-; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm9
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6]
+; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm0
+; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm1
-; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm15
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[5,12]
+; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm2
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12]
; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX2-FAST-NEXT: vpor %xmm8, %xmm9, %xmm8
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm9
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX2-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm10
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm9, %ymm8, %ymm8
-; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 =
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm6, %ymm8
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm14
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 =
-; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm6, %ymm8
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14]
-; AVX2-FAST-NEXT: vpor %xmm10, %xmm13, %xmm10
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm7, %ymm7
+; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 =
+; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm7
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX2-FAST-NEXT: vpor %xmm10, %xmm14, %xmm10
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm12
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0>
-; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm7, %ymm6
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm7
+; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm7
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpor %xmm7, %xmm15, %xmm7
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14]
+; AVX2-FAST-NEXT: vpor %xmm15, %xmm12, %xmm12
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm12, %ymm15
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0>
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm12
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u]
+; AVX2-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15]
-; AVX2-FAST-NEXT: vpor %xmm0, %xmm7, %xmm0
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15]
+; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm2
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm10
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 =
-; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm6
-; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm7
-; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm4, %ymm15
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm5
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm3, %ymm11
-; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm9
-; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm13
-; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm3, %xmm6, %xmm3
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm11[1,2],ymm7[3],ymm11[4,5,6],ymm7[7,8],ymm11[9,10],ymm7[11],ymm11[12,13,14],ymm7[15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3],ymm11[4],ymm9[5,6],ymm11[7,8],ymm9[9,10,11],ymm11[12],ymm9[13,14],ymm11[15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm7
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm9
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm4, %xmm9, %xmm4
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm9
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7,8],ymm9[9],ymm13[10,11],ymm9[12],ymm13[13,14,15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm9, %ymm4
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm14
+; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm5, %ymm11
+; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm8
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm9
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm5
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm13
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm10
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm6
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm2
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm4
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpor %xmm0, %xmm4, %xmm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1,2],ymm4[3],ymm13[4,5,6],ymm4[7,8],ymm13[9,10],ymm4[11],ymm13[12,13,14],ymm4[15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpor %xmm0, %xmm8, %xmm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm8
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm8, %ymm0
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7,8],ymm9[9],ymm2[10,11,12],ymm9[13],ymm2[14,15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm2
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u]
-; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2
+; AVX2-FAST-NEXT: vpor %xmm6, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm3
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
+; AVX2-FAST-NEXT: vpor %xmm3, %xmm6, %xmm3
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,3,5,6,1,3,5,6]
; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm3 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm14[1,2,3,4,5,6,7],ymm7[8],ymm14[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm12[1,2,3,4,5,6,7],ymm4[8],ymm12[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx)
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm3 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm4 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm15[1,2,3,4,5,6,7],ymm5[8],ymm15[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi)
+; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovaps %ymm5, (%rdx)
; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rcx)
; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r8)
-; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9)
+; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9)
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax)
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax)
-; AVX2-FAST-NEXT: popq %rax
+; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax)
+; AVX2-FAST-NEXT: addq $40, %rsp
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: load_i8_stride7_vf32:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm14
+; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm10
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm13
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 =
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm14, %ymm2, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm13
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm14, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7,8,9],ymm10[10],ymm9[11,12,13],ymm10[14],ymm9[15]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm10
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm11
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm8
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm8, %xmm1
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 =
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm8, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm12, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 =
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm8, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm13, %xmm8
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm13
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm12, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm14, %xmm12
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
+; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
+; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm15, %xmm11
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 =
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm3, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm6
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm15
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm14, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm14, %ymm0, %ymm7
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm14, %ymm0, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm14, %xmm6
-; AVX2-FAST-PERLANE-NEXT:
vextracti128 $1, %ymm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8,9,10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4,5,6],ymm15[7,8],ymm1[9,10],ymm15[11],ymm1[12,13,14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4],ymm9[5,6],ymm6[7,8],ymm9[9,10,11],ymm6[12],ymm9[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm11, %xmm3 +; 
AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $72, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4629,26 +4660,26 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i8_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1512, %rsp # imm = 0x5E8 -; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: subq $1528, %rsp # imm = 0x5F8 +; SSE-NEXT: movdqa 208(%rdi), %xmm12 ; SSE-NEXT: movdqa 192(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm6, 
%xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pxor %xmm6, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -4661,35 +4692,35 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,0,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] @@ -4698,36 +4729,35 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] @@ -4739,37 +4769,37 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: punpcklbw 
{{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa 288(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa 304(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] @@ -4779,32 +4809,32 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa 320(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 384(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand 
%xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] @@ -4816,37 +4846,38 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 336(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa 400(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa 416(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: movdqa 416(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] @@ -4856,32 +4887,31 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: movdqa 432(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] @@ -4893,283 +4923,283 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm7, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa 
%xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm13 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = 
[255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: 
pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; 
SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: pxor %xmm5, %xmm5 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 @@ -5179,19 +5209,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm9, 
%xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm12 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -5199,522 +5229,531 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload ; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm15, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm11, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm11, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: pandn %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pslld $16, %xmm14 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pslld $16, %xmm13 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = 
xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm8 ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm7 +; 
SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: 
movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; SSE-NEXT: packuswb %xmm9, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; 
SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: pandn %xmm3, %xmm15 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: por %xmm3, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm15, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: por %xmm3, %xmm15 ; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] +; SSE-NEXT: pand %xmm14, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: packuswb %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: packuswb %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm15, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,3,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] +; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm2, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] +; SSE-NEXT: packuswb %xmm2, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, 
%xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,3,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm8 
+; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pand 
%xmm14, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, 
%xmm4 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 @@ -5722,282 +5761,284 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} 
xmm7 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand 
%xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm13 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; 
SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa 
{{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] ; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: 
packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,1,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,2,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pxor %xmm6, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -6006,699 +6047,709 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 
-; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklbw 
{{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm11, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} 
xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm15[8],xmm7[9],xmm15[9],xmm7[10],xmm15[10],xmm7[11],xmm15[11],xmm7[12],xmm15[12],xmm7[13],xmm15[13],xmm7[14],xmm15[14],xmm7[15],xmm15[15] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm13, %xmm2 -; SSE-NEXT: andps %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE-NEXT: pxor %xmm12, %xmm12 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa 
%xmm4, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3] -; 
SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: andps %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm15[8],xmm7[9],xmm15[9],xmm7[10],xmm15[10],xmm7[11],xmm15[11],xmm7[12],xmm15[12],xmm7[13],xmm15[13],xmm7[14],xmm15[14],xmm7[15],xmm15[15] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm2, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm15[8],xmm7[9],xmm15[9],xmm7[10],xmm15[10],xmm7[11],xmm15[11],xmm7[12],xmm15[12],xmm7[13],xmm15[13],xmm7[14],xmm15[14],xmm7[15],xmm15[15] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm15[8],xmm7[9],xmm15[9],xmm7[10],xmm15[10],xmm7[11],xmm15[11],xmm7[12],xmm15[12],xmm7[13],xmm15[13],xmm7[14],xmm15[14],xmm7[15],xmm15[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 
= xmm12[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm12[0],xmm8[1,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm8[0],xmm9[1,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm7 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm15[8],xmm12[9],xmm15[9],xmm12[10],xmm15[10],xmm12[11],xmm15[11],xmm12[12],xmm15[12],xmm12[13],xmm15[13],xmm12[14],xmm15[14],xmm12[15],xmm15[15] -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,3] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm13, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,1,0,3] +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,3,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm8 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm13, %xmm7 +; SSE-NEXT: andps %xmm5, %xmm9 +; SSE-NEXT: por %xmm9, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; 
SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm8 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm15[8],xmm12[9],xmm15[9],xmm12[10],xmm15[10],xmm12[11],xmm15[11],xmm12[12],xmm15[12],xmm12[13],xmm15[13],xmm12[14],xmm15[14],xmm12[15],xmm15[15] -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm9, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] ; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: por %xmm13, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm15[0],xmm8[1,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm15 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: andps %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,5,4,7,6] +; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm9[0],xmm15[1,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pand %xmm14, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pxor %xmm15, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: 
movdqa %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: por %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: andps %xmm5, %xmm15 +; SSE-NEXT: por %xmm15, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: packuswb %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: packuswb %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,2,2,3] +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] 
+; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm8, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm14, %xmm11 +; SSE-NEXT: andps %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] +; SSE-NEXT: packuswb %xmm8, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: 
movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,1,1] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,1,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm10 -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: andps %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: packuswb %xmm0, %xmm14 +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm8, %xmm14 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: andps %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[3,3,3,3] +; SSE-NEXT: packuswb %xmm12, %xmm8 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = 
xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: andps %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm13, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por %xmm13, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[3,3,3,3] +; SSE-NEXT: packuswb %xmm13, %xmm8 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,1,1] +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = 
xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm10 -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: andps %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[3,3,3,3] -; SSE-NEXT: packuswb %xmm10, %xmm3 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: andps %xmm4, %xmm8 +; SSE-NEXT: por %xmm8, %xmm13 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: por %xmm14, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm14, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[3,3,3,3] -; SSE-NEXT: packuswb %xmm13, %xmm3 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm13, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: 
pand %xmm11, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,1,0,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE-NEXT: pxor %xmm15, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,7,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,3,3,3] +; SSE-NEXT: packuswb %xmm8, %xmm14 +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm1[0],xmm14[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] ; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: andps %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: andps %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 
# 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: movdqa %xmm12, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rax) +; SSE-NEXT: movdqa %xmm11, (%rax) +; SSE-NEXT: movdqa %xmm9, 48(%rax) +; SSE-NEXT: movdqa %xmm7, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: movdqa %xmm10, 48(%rax) -; SSE-NEXT: movdqa %xmm1, 32(%rax) -; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: addq $1512, %rsp # imm = 0x5E8 +; SSE-NEXT: movdqa %xmm1, (%rax) +; SSE-NEXT: movdqa %xmm13, 48(%rax) +; SSE-NEXT: movdqa %xmm12, 32(%rax) +; SSE-NEXT: movdqa %xmm3, 16(%rax) +; SSE-NEXT: addq $1528, %rsp # imm = 0x5F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf64: @@ -6710,7 +6761,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0] @@ -6752,738 +6803,777 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm2, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, 
%xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm5, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; 
AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0] ; 
AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm15, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm3, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm13, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm12 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 
{{.*#+}} xmm13 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm6, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm13 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm10, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm5, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] -; 
AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm12 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] -; 
AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm12, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm5, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm14 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: 
vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm4, %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm11 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm14 
-; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm4[1,2],xmm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = 
[0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm9, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1,2],xmm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm8, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: 
vbroadcastss {{.*#+}} xmm10 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} 
xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq 
{{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; AVX1-ONLY-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpxor %xmm11, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm0[7] +; 
AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,128,128,3,10,0,128,128,128,128,128,3,10] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm15[3,10] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps 
%ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vxorps %xmm7, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, 
%ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm10[6,13] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: 
vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [12,0,0,0,128,128,128,5,12,0,0,0,128,128,128,5] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,0,7,14,128,128,0,0,0,0,7,14,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,128,6,13,0,128,128,128,128,128,6,13] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm3 
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm15[6,13] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,xmm6[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 
# 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,1,8,15,128,128,0,0,0,1,8,15,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd 
{{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm13 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u],zero,zero,xmm13[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm14[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps 
%ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm12[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm10[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm7 -; 
AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7511,17 +7601,17 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX1-ONLY-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i8_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $744, %rsp # imm = 0x2E8 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-SLOW-NEXT: subq $760, %rsp # imm = 0x2F8 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -7529,9 +7619,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> @@ -7553,242 +7643,240 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm12, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 
%ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7,8,9],ymm2[10],ymm1[11,12,13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm6, %ymm10, %ymm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7,8,9],ymm4[10],ymm2[11,12,13],ymm4[14],ymm2[15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm12, %ymm15, %ymm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-SLOW-NEXT: 
vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm3 +; 
AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm10 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm13, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; 
AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm11 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = 
<255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm10, %ymm7, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm8 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm15, %ymm8 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm14, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm9 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; AVX2-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: 
vmovdqa {{.*#+}} xmm12 = -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm7 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm11 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm14, %xmm11 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm9 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm11 +; AVX2-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm15, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = 
<255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -7796,244 +7884,243 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm7 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm11 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm9 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm15, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm15 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = 
<255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 
= <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm15 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, 
%xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm15, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1,2],ymm3[3],ymm6[4,5,6],ymm3[7,8],ymm6[9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb 
%ymm15, %ymm1, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm10[1,2,3],ymm4[4],ymm10[5,6],ymm4[7,8],ymm10[9,10,11],ymm4[12],ymm10[13,14],ymm4[15] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm7 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: 
vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1,2,3,4,5,6,7],ymm10[8],ymm6[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 
# 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm7[0],mem[1,2,3,4,5,6,7],ymm7[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -8042,38 +8129,37 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $744, %rsp 
# imm = 0x2E8 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-SLOW-NEXT: addq $760, %rsp # imm = 0x2F8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i8_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FAST-NEXT: subq $776, %rsp # imm = 0x308 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> @@ -8083,239 +8169,250 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 
288(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, 
%ymm6, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7,8,9],ymm8[10],ymm3[11,12,13],ymm8[14],ymm3[15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,2,0,2,1,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm13, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,0,2,1,3,4,6] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm0 -; 
AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm14, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] ; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm12 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm12, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vmovdqa %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm8, %xmm10 -; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 +; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm13, %ymm15, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm15 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm13 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm14 -; AVX2-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm7 +; AVX2-FAST-NEXT: 
vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm14 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm13 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm14, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm11 +; AVX2-FAST-NEXT: vpor %xmm13, %xmm11, %xmm11 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm14, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm6 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm5 -; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm1, 
%xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -8323,37 +8420,40 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm14, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor 
%xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -8361,192 +8461,198 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm1 +; 
AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm13 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8,9,10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm10, %ymm13, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm10, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8,9,10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; 
AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2],ymm12[3],ymm1[4,5,6],ymm12[7,8],ymm1[9,10],ymm12[11],ymm1[12,13,14],ymm12[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5,6],ymm1[7,8],ymm2[9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1,2,3],ymm1[4],ymm5[5,6],ymm1[7,8],ymm5[9,10,11],ymm1[12],ymm5[13,14],ymm1[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1,2,3],ymm2[4],ymm7[5,6],ymm2[7,8],ymm7[9,10,11],ymm2[12],ymm7[13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, 
%ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5,6],ymm1[7,8],ymm12[9,10],ymm1[11],ymm12[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4],ymm14[5,6],ymm3[7,8],ymm14[9,10,11],ymm3[12],ymm14[13,14],ymm3[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 
32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1,2,3],ymm1[4],ymm15[5,6],ymm1[7,8],ymm15[9,10,11],ymm1[12],ymm15[13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,1,2,1,3,5,6] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm7 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7,8],ymm8[9],ymm13[10,11,12],ymm8[13],ymm13[14,15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7,8],ymm8[9],ymm11[10,11,12],ymm8[13],ymm11[14,15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vpor %xmm8, %xmm13, %xmm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,1,2,1,3,5,6] -; AVX2-FAST-NEXT: 
vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm8[1,2,3,4,5,6,7],ymm6[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm8[1,2,3,4,5,6,7],ymm3[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = 
ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, (%rsp), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm7[0],mem[1,2,3,4,5,6,7],ymm7[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -8555,28 +8661,28 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) -; AVX2-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-NEXT: addq $776, %rsp # imm = 0x308 ; AVX2-FAST-NEXT: vzeroupper ; 
AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $744, %rsp # imm = 0x2E8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: subq $760, %rsp # imm = 0x2F8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm4 @@ -8584,9 +8690,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> @@ -8608,242 +8714,240 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7,8,9],ymm2[10],ymm1[11,12,13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm6, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7,8,9],ymm4[10],ymm2[11,12,13],ymm4[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm12, %ymm15, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, 
%xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb 
%ymm7, %ymm9, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm10, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm14, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, 
%xmm1, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm14, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm12, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm11, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, 
%ymm15, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -8851,244 +8955,243 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm15, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 
# 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb 
%ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1,2],ymm3[3],ymm6[4,5,6],ymm3[7,8],ymm6[9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm10[1,2,3],ymm4[4],ymm10[5,6],ymm4[7,8],ymm10[9,10,11],ymm4[12],ymm10[13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 
32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, 
%ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1,2,3,4,5,6,7],ymm10[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm7[0],mem[1,2,3,4,5,6,7],ymm7[8],mem[9,10,11,12,13,14,15] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -9097,2061 +9200,2077 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $760, %rsp # imm = 0x2F8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: pushq %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm24, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: subq $72, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = 
[65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm19, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4,5],ymm5[6],ymm2[7,8,9],ymm5[10],ymm2[11,12,13],ymm5[14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm4, %xmm9, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7,8,9],ymm13[10],ymm10[11,12,13],ymm13[14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm9, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm13, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm24, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7,8,9],ymm13[10],ymm3[11,12,13],ymm13[14],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm22, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm24, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm30, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7,8,9],ymm0[10],ymm7[11,12,13],ymm0[14],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm13[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm26, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: 
vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7,8,9,10],ymm13[11],ymm10[12,13],ymm13[14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6],ymm6[7,8],ymm5[9,10],ymm6[11],ymm5[12,13,14],ymm6[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm24, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5,6],ymm0[7,8],ymm3[9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8,9,10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm27, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm13[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: 
vpternlogq $202, %ymm14, %ymm19, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm14[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4,5,6],ymm13[7,8],ymm10[9,10],ymm13[11],ymm10[12,13,14],ymm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, 
%ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6],ymm6[7,8],ymm5[9,10,11],ymm6[12],ymm5[13,14],ymm6[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm29, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6,7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm9, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm9, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm29, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm29, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm19, %ymm12, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2,3],ymm11[4],ymm2[5,6],ymm11[7,8],ymm2[9,10,11],ymm11[12],ymm2[13,14],ymm11[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u,u,u,u,u,u] +; 
AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm26, %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[2,9,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3],ymm13[4],ymm10[5,6],ymm13[7,8],ymm10[9,10,11],ymm13[12],ymm10[13,14],ymm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero +; 
AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm27, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm3, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6,7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm28, %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm7, %ymm3, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm27, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7,8],ymm10[9],ymm7[10,11,12],ymm10[13],ymm7[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa %ymm11, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm7, %ymm3, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm12, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[6,13] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm24, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm20, %ymm1, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm19, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm24, %ymm19, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm0, %ymm2, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm17, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7,8],ymm11[9],ymm8[10,11],ymm11[12],ymm8[13,14,15] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm5, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm0, %ymm5, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm17, %ymm1, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: 
vpternlogq $184, %ymm2, %ymm27, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm9, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm27, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7,8],ymm8[9],ymm1[10,11,12],ymm8[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm18, %ymm9, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, 
%ymm31, %ymm29, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm17, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm17, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm15, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7,8,9],ymm5[10],ymm15[11,12],ymm5[13],ymm15[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm23, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm27, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm16[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm23, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-SLOW-NEXT: popq %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $72, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: 
load_i8_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: pushq %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 32(%rdi), %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm20, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm25, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm12, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 160(%rdi), %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm26, %ymm31, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 288(%rdi), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,6,1,2,4,6] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 288(%rdi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm28, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] ; 
AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm22, %ymm13, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4,5],ymm2[6],ymm14[7,8,9],ymm2[10],ymm14[11,12,13],ymm2[14],ymm14[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm4, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 416(%rdi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 352(%rdi), %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7,8,9],ymm7[10],ymm15[11,12,13],ymm7[14],ymm15[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 416(%rdi), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm7, %ymm22, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm29, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm20, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm25, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7,8,9],ymm3[10],ymm4[11,12,13],ymm3[14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm26, %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, 
%xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm22, %ymm13, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4,5],ymm9[6],ymm5[7,8,9],ymm9[10],ymm5[11,12,13],ymm9[14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor 
%xmm7, %xmm8, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm18, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm5, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm7, %ymm22, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm29, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7,8,9,10],ymm9[11],ymm5[12,13],ymm9[14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 +; 
AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,5,6,1,3,5,6] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm18, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2],ymm7[3],ymm5[4,5,6],ymm7[7,8],ymm5[9,10],ymm7[11],ymm5[12,13,14],ymm7[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm29, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm20, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm25, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm5, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm2, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm26, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm22, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5,6],ymm5[7,8],ymm4[9,10],ymm5[11],ymm4[12,13,14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm4, %ymm27, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm20, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm25, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5,6],ymm3[7,8],ymm2[9,10],ymm3[11],ymm2[12,13,14],ymm3[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm6, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm26, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm9[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm28, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm22, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3],ymm7[4],ymm6[5,6],ymm7[7,8],ymm6[9,10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm2, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm6, %ymm27, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm29, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm16, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %xmm4, %xmm2, %xmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm22, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm4, %ymm27, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm22, %ymm13, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm4, %ymm27, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm26, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm16, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 208(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512F-ONLY-FAST-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm11, %ymm25, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm17, %ymm12, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm25, %ymm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm25, %ymm11, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2,3],ymm11[4],ymm2[5,6],ymm11[7,8],ymm2[9,10,11],ymm11[12],ymm2[13,14],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm3, %ymm29, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6,7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm3, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm8, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, 
%zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm28, %ymm18, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm8, %ymm22, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm29, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6,7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm18, %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm12, %ymm10, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm18, %ymm11, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm12, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm10, %ymm12, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm10, 
%xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3],ymm10[4],ymm8[5,6],ymm10[7,8],ymm8[9,10,11],ymm10[12],ymm8[13,14],ymm10[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm1, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm3, %ymm25, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6,7,8],ymm10[9],ymm14[10,11],ymm10[12],ymm14[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7,8],ymm13[9],ymm11[10,11,12],ymm13[13],ymm11[14,15] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm3, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm26, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm22, %ymm14, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm26, %ymm31, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] -; AVX512F-ONLY-FAST-NEXT: 
vpor %xmm4, %xmm11, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm11, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm16, %ymm15, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; 
AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7,8,9],ymm5[10],ymm13[11,12],ymm5[13],ymm13[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u] +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm28, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm26, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: popq %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm8, %ymm26, %ymm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: subq $104, %rsp +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 -; 
AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm17, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm13 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm13[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4,5],ymm10[6],ymm13[7,8,9],ymm10[10],ymm13[11,12,13],ymm10[14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = 
[65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm9, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm1[2],ymm4[3,4,5],ymm1[6],ymm4[7,8,9],ymm1[10],ymm4[11,12,13],ymm1[14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb 
{{.*#+}} xmm14 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 288(%rdi), %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7,8,9],ymm4[10],ymm2[11,12,13],ymm4[14],ymm2[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm10, %xmm4 +; 
AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm10 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7,8,9,10],ymm13[11],ymm10[12,13],ymm13[14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm4, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm24, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm8, %ymm26, %ymm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7,8,9],ymm1[10],ymm5[11,12,13],ymm1[14],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[1,8,15],zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm24, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm8, %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm1, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm10, %xmm0 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm12[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm5 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3],ymm5[4,5,6],ymm2[7,8],ymm5[9,10],ymm2[11],ymm5[12,13,14],ymm2[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm13 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm13[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm13[1,2],ymm6[3],ymm13[4,5,6],ymm6[7,8],ymm13[9,10],ymm6[11],ymm13[12,13,14],ymm6[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm24, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm25 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: 
vpternlogq $202, %ymm8, %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm6, %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5,6],ymm1[7,8],ymm2[9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm23, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,12] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, %xmm1 -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm23, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm14, %ymm19, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2],ymm11[3],ymm4[4,5,6],ymm11[7,8],ymm4[9,10],ymm11[11],ymm4[12,13,14],ymm11[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm23, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm13, %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm12[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm20, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm6 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm6[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1,2,3],ymm10[4],ymm6[5,6],ymm10[7,8],ymm6[9,10,11],ymm10[12],ymm6[13,14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm4, %ymm6 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm24, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, 
%zmm28 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm6, %ymm25, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm17, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm15, %ymm17, %ymm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm27, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm5, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm23 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm25, %ymm23 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm0 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm4, 
%xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm24, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u] +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm17, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm17, %ymm15, %ymm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm24, %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] +; 
AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm4, %ymm25, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vporq %xmm2, %xmm0, %xmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm8, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm11, %ymm18, %ymm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm19, %ymm9, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm18, %ymm11, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6],ymm1[7,8],ymm2[9,10,11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm23, %ymm5 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6,7,8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7,8],ymm1[9],ymm9[10,11,12],ymm1[13],ymm9[14,15] -; 
AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm3
-; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm0
-; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm16, %ymm31, %ymm4
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u]
-; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u]
-; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm17, %ymm10, %ymm15
-; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm16, %ymm10
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14]
-; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-SLOW-NEXT: vporq %xmm4, %xmm0, %xmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm10
+; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm28
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm13
+; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm13
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm0
+; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm19, %ymm14, %ymm7
+; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm5
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13]
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm10
+; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm20, %ymm15, %ymm17
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm6
+; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm6
+; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm15
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3],ymm11[4],ymm10[5,6],ymm11[7,8],ymm10[9,10,11],ymm11[12],ymm10[13,14],ymm11[15]
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm1, %ymm2, %ymm14
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm10
 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm2, %ymm23, %ymm4
-; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm2
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
-; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15]
-; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm7, %xmm2
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm23, %ymm2
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1
-; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm18, %zmm1
-; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm4
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm18, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm10, %ymm19, %ymm4
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm10
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm12
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5,6,7,8],ymm11[9],ymm6[10,11],ymm11[12],ymm6[13,14,15]
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm1, %ymm12, %ymm10
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm2
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7,8],ymm11[9],ymm15[10,11,12],ymm11[13],ymm15[14,15]
+; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm2, %ymm6, %ymm11
+; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm7
+; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm0
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0
+; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm18, %ymm9, %ymm16
+; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
+; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm19, %ymm4
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm0
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm6, %xmm0
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
+; AVX512DQ-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm19, %ymm6
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0
+; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm4
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4
+; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm1, %zmm4
 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
-; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm27, %zmm0, %zmm0 {%k1}
-; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm30, %zmm0, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u]
-; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm4
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
-; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm15[2,3,0,1]
-; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15]
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5
-; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm29, %xmm3
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
-; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rsi)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, (%rdx)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%r8)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r9)
+; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm23, %zmm0, %zmm7 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u]
+; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm17, %xmm2
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3
+; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm16[2,3,0,1]
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3
+; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm28, %xmm1
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15]
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero
+; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rsi)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rdx)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r8)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%r9)
 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rax)
+; AVX512DQ-SLOW-NEXT: addq $104, %rsp
 ; AVX512DQ-SLOW-NEXT: vzeroupper
 ; AVX512DQ-SLOW-NEXT: retq
 ;
 ; AVX512DQ-FAST-LABEL: load_i8_stride7_vf64:
 ; AVX512DQ-FAST: # %bb.0:
 ; AVX512DQ-FAST-NEXT: pushq %rax
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm16
-; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm8
-; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm8, %ymm16, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm13
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm14
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm1
 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
 ; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm11
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm24, %ymm11, %ymm2
-; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdi), %xmm0
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm7
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm25
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm12, %ymm2
+; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdi), %xmm10
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7,8,9],ymm10[10],ymm2[11,12],ymm10[13],ymm2[14,15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm30
 ; AVX512DQ-FAST-NEXT: vmovdqa64 160(%rdi), %ymm31
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm1
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm25, %ymm31, %ymm1
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm1
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm1
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,6,1,2,4,6]
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1
-; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,6,1,2,4,6]
+; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
 ; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm0
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, %xmm6
-; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm0
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm23
-; AVX512DQ-FAST-NEXT: vpor %xmm10, %xmm13, %xmm10
-; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm3, %zmm20
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm21, %zmm20
-; AVX512DQ-FAST-NEXT: vmovdqa64 288(%rdi), %ymm17
-; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm10
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm13, %xmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 352(%rdi), %ymm29
-; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm13
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm14
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm13, %ymm14
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm14[2,3,0,1]
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4,5],ymm3[6],ymm14[7,8,9],ymm3[10],ymm14[11,12,13],ymm3[14],ymm14[15]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm21
+; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm1, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm24, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 288(%rdi), %ymm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %ymm29
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm4
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm4
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm4, %xmm8, %xmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 352(%rdi), %ymm16
+; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm4
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm15
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,3,0,1]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7,8,9],ymm7[10],ymm15[11,12,13],ymm7[14],ymm15[15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm3
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FAST-NEXT: vmovdqa64 416(%rdi), %ymm18
-; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm19
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm2
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero
-; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm3, %ymm22, %ymm2
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm20, %zmm30, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm8, %ymm16, %ymm2
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm8, %ymm7
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vmovdqa64 416(%rdi), %ymm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm20
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm8
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm8
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm7, %ymm23, %ymm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm24, %ymm3
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4,5],ymm7[6],ymm3[7,8,9],ymm7[10],ymm3[11,12,13],ymm7[14],ymm3[15]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm25, %ymm31, %ymm2
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6]
-; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, %xmm9
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm2
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm3
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm13, %ymm5
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm3
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm6
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4,5],ymm10[6],ymm6[7,8,9],ymm10[10],ymm6[11,12,13],ymm10[14],ymm6[15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u]
 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,4,6,1,3,4,6]
+; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm7
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm1
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm24, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm6
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm6
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm7
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm7
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm6, %ymm7
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm6
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm6
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10]
+; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm8, %xmm6
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm7, %ymm23, %ymm6
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm25
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm28, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u,u,u]
 ; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm6
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8,9,10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3
 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm3
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm20
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm8, %ymm16, %ymm2
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm24, %ymm3
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15]
-; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm2, %ymm3
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm2
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm25, %ymm2
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6]
-; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm3
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u]
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,5,6,1,3,5,6]
+; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm7, %xmm3
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm24, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm3
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u]
 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm5
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm29, %ymm5
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6],ymm6[7,8],ymm5[9,10],ymm6[11],ymm5[12,13,14],ymm6[15]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm5
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm3
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
+; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm6
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm6
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5,6],ymm7[7,8],ymm6[9,10],ymm7[11],ymm6[12,13,14],ymm7[15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3
 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
-; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3
+-; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3
++; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm3, %xmm3
 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm3
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm23
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm30, %zmm23
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm1
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm8, %ymm16, %ymm1
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm24, %ymm3
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5,6],ymm7[7,8],ymm3[9,10],ymm7[11],ymm3[12,13,14],ymm7[15]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm1, %ymm5
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm1
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm25, %ymm1
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm6
-; AVX512DQ-FAST-NEXT: vmovdqa 208(%rdi), %xmm1
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12]
-; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm3
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm26, %ymm2
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm9[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm2, %zmm2
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm28, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm5
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm6
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm29, %ymm6
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3],ymm7[4],ymm6[5,6],ymm7[7,8],ymm6[9,10,11],ymm7[12],ymm6[13,14],ymm7[15]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm5, %ymm6
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm5
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm5
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero
-; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm22, %ymm5
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm28
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm28
+-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm3
+-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm23
+-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm30, %zmm23
+-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm1
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm8, %ymm16, %ymm1
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1
+-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm3
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm24, %ymm3
+-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5,6],ymm7[7,8],ymm3[9,10],ymm7[11],ymm3[12,13,14],ymm7[15]
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm1, %ymm5
+-; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm1
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm25, %ymm1
+-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm6
+-; AVX512DQ-FAST-NEXT: vmovdqa 208(%rdi), %xmm1
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12]
+-; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm3
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+-; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2
+-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm26, %ymm2
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm9[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6
+-; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm2, %zmm2
+-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm28, %zmm2
+-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm10, %ymm17, %ymm5
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
+-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u]
+-; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm5, %xmm5
+-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm6
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm29, %ymm6
+-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3],ymm7[4],ymm6[5,6],ymm7[7,8],ymm6[9,10,11],ymm7[12],ymm6[13,14],ymm7[15]
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm5, %ymm6
+-; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm5
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm5
+-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero
+-; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm5, %xmm5
+-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm22, %ymm5
+-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm28
+-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm28
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm23, %ymm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm14, %ymm13, %ymm2
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm3
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm3[1,2],ymm10[3],ymm3[4,5,6],ymm10[7,8],ymm3[9,10],ymm10[11],ymm3[12,13,14],ymm10[15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm2, %ymm3
 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm2
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm29, %ymm5
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13,14,15]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm5
-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm2
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2
+-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
+-; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
+-; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm29, %ymm5
+-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
+-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13,14,15]
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm2, %ymm5
+-; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm2
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2
 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13]
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
 -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2
 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm27
 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm27
 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm8, %ymm2
 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u]
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
 -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm9
 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm10, %ymm2
 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
 -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2
 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm5
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm13, %ymm5
 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15]
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
 -; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5
 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm18, %ymm2
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
 -; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2
 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm30
 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm5, %ymm22, %ymm30
 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm2
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm25, %ymm2
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u]
 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
 -; AVX512DQ-FAST-NEXT: vporq %xmm5, %xmm2, %xmm22
 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm2
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm19, %ymm15
 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm5
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm8, %ymm5
 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm6
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm8, %ymm4
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13]
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
 -; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm11, %ymm24, %ymm2
 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm17, %ymm12, %ymm10
 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm8
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm24, %ymm11, %ymm8
 -; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm24, %ymm11, %ymm12
 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u]
 -; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm11, %xmm5
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm2, %xmm6
+; AVX512DQ-FAST-NEXT: vmovdqa 208(%rdi), %xmm7
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[5,12]
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm2
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-FAST-NEXT: vpor %xmm8, %xmm9, %xmm8
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm8
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm0[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm9, %xmm6
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm8, %zmm6
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm29, %ymm18, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm8
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm8
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm8
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm9, %xmm3
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm8, %ymm23, %ymm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm24
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm3
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm6
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm4, %ymm16, %ymm6
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm27, %ymm3, %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm27
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm23, %ymm27
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm3
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm1
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm18, %ymm29, %ymm3
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm16, %ymm4, %ymm6
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7,8],ymm8[9],ymm6[10,11,12],ymm8[13],ymm6[14,15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm6
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm28
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm23, %ymm28
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vporq %xmm6, %xmm3, %xmm23
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm8
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm22
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm3
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm3
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm6
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm13, %ymm14, %ymm5
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-FAST-NEXT: vpor %xmm9, %xmm13, %xmm9
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm12, %ymm17, %ymm8
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm18, %ymm11, %ymm29
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm12, %ymm13
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm17, %ymm12, %ymm11
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm10
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm10, %xmm3
 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6],ymm0[7,8],ymm2[9,10,11],ymm0[12],ymm2[13,14],ymm0[15]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm9, %ymm2
-; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm11
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm11, %ymm26, %ymm7
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6],ymm0[7,8],ymm8[9,10,11],ymm0[12],ymm8[13,14],ymm0[15]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm1, %ymm12
+; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm8
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm14
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm8, %ymm1, %ymm14
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6,7,8],ymm0[9],ymm13[10,11],ymm0[12],ymm13[13,14,15]
 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm5, %ymm8
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
-; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm4, %ymm5
-; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm4
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm8
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7,8],ymm0[9],ymm11[10,11,12],ymm0[13],ymm11[14,15]
+; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm26, %ymm3, %ymm9
 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm7, %zmm4
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm25, %ymm6
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm5
+; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm6
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6
 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm29, %ymm14, %ymm13
-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm25, %ymm31, %ymm14
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14]
+-; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2
+-; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm29, %ymm14, %ymm13
+-; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm25, %ymm31, %ymm14
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14]
++; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm11, %xmm6
++; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm16, %ymm15, %ymm4
++; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm15
++; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
++; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14]
++; AVX512DQ-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11
++; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
++; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
++; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm1, %ymm11
++; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6
++; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u]
++; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u]
 ; AVX512DQ-FAST-NEXT: vpor %xmm6, %xmm12, %xmm6
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
++; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
++; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15]
++; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2
 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm26, %ymm6
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm2
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm12, %xmm2
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15]
-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1
+-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm26, %ymm6
+-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm2
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm12, %xmm2
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15]
+-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm26, %ymm1
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm7, %zmm2
-; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm3
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm7, %zmm1
+-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+-; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm26, %ymm1
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2
+-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm7, %zmm2
+-; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm3
+-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+-; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm7, %zmm1
++; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm6, %ymm1, %ymm2
++; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
++; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
++; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
++; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6
++; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm3, %zmm6
++; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm7
++; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
++; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
++; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
++; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm3, %zmm2
 ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm27, %zmm0, %zmm4 {%k1}
-; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm30, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u]
-; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm5
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u]
-; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1]
-; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7,8,9],ymm5[10],ymm13[11,12],ymm5[13],ymm13[14,15]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT: vpternlogq $248, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm27, %zmm0, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm28, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u] +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm29, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[2,3,0,1] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,22,29,20,27,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm22, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-FAST-NEXT: popq %rax ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = 
<0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k2 ; AVX512BW-ONLY-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm1, %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512BW-ONLY-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm3 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: 
vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm10, %zmm25 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 ; AVX512BW-ONLY-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm10 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm10, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm10 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm14[2],ymm10[3,4,5],ymm14[6],ymm10[7,8,9],ymm14[10],ymm10[11,12,13],ymm14[14],ymm10[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm10, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512BW-ONLY-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm20 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 ; AVX512BW-ONLY-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm22 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm22, %ymm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm22 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm22, %ymm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] ; 
AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm1, %xmm22, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm22, %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm17, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm9, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm21[0],xmm17[0],xmm21[1],xmm17[1],xmm21[2],xmm17[2],xmm21[3],xmm17[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 
= xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm17, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm22, %xmm14, %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm21, %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = 
ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k2 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6],ymm3[7,8],ymm1[9,10],ymm3[11],ymm1[12,13,14],ymm3[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6],ymm3[7,8],ymm1[9,10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: 
vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm1, %xmm0, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm2, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k7 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm0, %ymm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm21 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm16, %ymm15 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm15[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7,8,9],ymm1[10],ymm15[11,12],ymm1[13],ymm15[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 
$1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm14, %ymm10 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm6, %ymm2 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm8, %ymm5 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm12 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-ONLY-SLOW-NEXT: kmovd 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm6, %ymm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm10, %ymm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm6, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = 
ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm11 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm9, %xmm13, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb 
{{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm6[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm26, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm12 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm26, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rdi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512BW-ONLY-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k2 ; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512BW-ONLY-FAST-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k3 ; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 @@ -11165,286 +11284,284 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 
; AVX512BW-ONLY-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k7 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm12 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm12[u,u,u,u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm12, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k6 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm10, %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movw $3968, %ax # imm = 0xF80 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm10, %ymm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512BW-ONLY-FAST-NEXT: movw $4644, %ax # imm = 0x1224 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm20 {%k6} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 ; AVX512BW-ONLY-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm20 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb 
{{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm20, %ymm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm20 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] ; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm14, %zmm13 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm14 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm15, %xmm14, %xmm15 ; AVX512BW-ONLY-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k5 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm14, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] ; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm14, %xmm19, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm14 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512BW-ONLY-FAST-NEXT: 
vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 ; AVX512BW-ONLY-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 ; AVX512BW-ONLY-FAST-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm18, %ymm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm18, %zmm14, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu16 %ymm18, %ymm0 {%k7} ; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm18, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm14[6,13],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm14 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm16 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] ; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] +; AVX512BW-ONLY-FAST-NEXT: 
vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm16, %xmm16 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm0, %ymm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm14 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm21 {%k7} -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu16 %ymm9, %ymm3 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm11, %ymm8 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm21 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm22, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm11, %ymm9 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm21, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm21, %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm22, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm11 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm8, %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm13 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm11, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u] -; 
AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm8, %xmm8 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm11, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm11, %ymm0, %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm19, %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm19, %ymm8 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm20, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm19, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm11, %ymm21 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm9, %zmm0 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: 
vporq %xmm9, %xmm17, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm11, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm11, %zmm9 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm0, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm12, %zmm11 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm1, %zmm3 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: 
vpshufb %xmm0, %xmm7, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm3 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] @@ -11453,396 +11570,396 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-ONLY-FAST-NEXT: movl $4186112, %edi # imm = 0x3FE000 ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm1, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm9, (%rdi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rdi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512DQBW-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm3 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k2 ; AVX512DQBW-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm1, %xmm25 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm16 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 ; 
AVX512DQBW-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k6 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm3 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 ; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm10, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm25 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 ; AVX512DQBW-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm10 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] 
-; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm10, %xmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm10 {%k6} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm14[2],ymm10[3,4,5],ymm14[6],ymm10[7,8,9],ymm14[10],ymm10[11,12,13],ymm14[14],ymm10[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k7 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm10, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512DQBW-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k4 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm20 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} ; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512DQBW-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 ; AVX512DQBW-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm22 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4} ; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm22, %ymm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm22 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm22, %ymm10 {%k1} +; 
AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm1, %xmm22, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k5} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm22, %xmm14 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k5} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 ; AVX512DQBW-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k5 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm17, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm9, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = 
xmm21[0],xmm17[0],xmm21[1],xmm17[1],xmm21[2],xmm17[2],xmm21[3],xmm17[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm17, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm22, %xmm14, %xmm14 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm21, %xmm14 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] +; 
AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 ; AVX512DQBW-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k2 ; AVX512DQBW-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6],ymm3[7,8],ymm1[9,10],ymm3[11],ymm1[12,13,14],ymm3[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k6} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6],ymm3[7,8],ymm1[9,10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm14 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm1, %xmm0, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm2, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k7 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; 
AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm22 {%k6} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm21 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm16, %ymm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm22 {%k6} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm1, %ymm21 {%k6} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512DQBW-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm14, %ymm10 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm8, %ymm5 {%k4} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm1, %ymm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm9, %ymm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm11 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQBW-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; 
AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm6, %ymm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512DQBW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm6, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] +; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm8, 
%xmm7 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> +; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm12 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} ; AVX512DQBW-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm3 {%k5} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm6[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm26, %xmm2 +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb 
{{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm26, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k5} +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5,6,7],ymm12[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rdx) +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, (%rdi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rdi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq @@ -11850,43 +11967,43 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512DQBW-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} ; AVX512DQBW-FAST-NEXT: kmovq %k1, %k2 ; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 
2-byte Spill -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512DQBW-FAST-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} ; AVX512DQBW-FAST-NEXT: kmovq %k1, %k3 ; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 @@ -11900,312 +12017,313 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm7 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 ; AVX512DQBW-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQBW-FAST-NEXT: kmovq %rax, %k5 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX512DQBW-FAST-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k6 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm12 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm12[u,u,u,u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm12, %xmm21 -; 
AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm10, %xmm21 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k7 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm10, %ymm21 {%k7} -; AVX512DQBW-FAST-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} +; AVX512DQBW-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512DQBW-FAST-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k4 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm20 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4} ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512DQBW-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 ; AVX512DQBW-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm20 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm20, %ymm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm20 {%k6} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] ; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 ; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm14, %zmm13 {%k5} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm14 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQBW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm15 ; AVX512DQBW-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm15 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm20 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm20 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm14, %xmm14 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] ; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} 
xmm14 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm14, %xmm19, %xmm14 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm14 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQBW-FAST-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 ; AVX512DQBW-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 ; AVX512DQBW-FAST-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm18, %ymm14 {%k2} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,xmm5[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm18, %zmm14, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: 
vmovdqu8 %ymm23, %ymm15 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k3 ; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQBW-FAST-NEXT: kmovq %rax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} ; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] +; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm18, %xmm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = 
xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm14[6,13],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11] -; AVX512DQBW-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm14 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm16 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = 
xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] ; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm14, %zmm16 -; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm14 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm16, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm16, %xmm16 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm16 {%k1} +; 
AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} ; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm14 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm21 {%k4} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm17 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm9, %ymm3 {%k6} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm11, %ymm8 {%k4} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm21 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = 
xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm22, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm9 {%k3} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm13, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm22, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm0 {%k3} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm8, %xmm8 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm13, %ymm0 {%k2} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] ; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm19, %xmm11 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm13 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm11, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm19, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm20, %zmm1 -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = 
zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm11 +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm21, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm20 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm5[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm9, %zmm20 {%k5} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm9, %xmm17, %xmm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm5[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm9 {%k5} +; AVX512DQBW-FAST-NEXT: vporq %xmm2, %xmm21, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm21, %xmm8, %xmm11 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm11 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = 
xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm11 {%k5} ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm21, %xmm7, %xmm3 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} +; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm3, %zmm3 ; AVX512DQBW-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQBW-FAST-NEXT: movl $4186112, %edi # imm = 0x3FE000 ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm1, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm3, %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rdi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rdi) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 9c6b03d69af65..3ca2aae97def0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -381,11 +381,11 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: pushq %rax ; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: movdqa 48(%rdi), %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 @@ -399,58 +399,58 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = 
xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] ; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm13 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] ; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,1,3] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,2,2,3] +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm1 @@ -475,51 +475,51 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[3,3,3,3] +; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm4[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm5, %xmm12 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm11, %xmm11 +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: packuswb %xmm14, %xmm14 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] ; SSE-NEXT: packuswb %xmm9, %xmm9 ; SSE-NEXT: pand %xmm3, %xmm9 @@ -528,42 +528,42 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] ; SSE-NEXT: packuswb %xmm13, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,1,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] ; 
SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,5,7,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm13, %xmm2 @@ -576,8 +576,8 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlps %xmm0, (%rdx) ; SSE-NEXT: movq %xmm8, (%rcx) -; SSE-NEXT: movq %xmm1, (%r8) -; SSE-NEXT: movq %xmm12, (%r9) +; SSE-NEXT: movq %xmm15, (%r8) +; SSE-NEXT: movq %xmm11, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq %xmm9, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -816,13 +816,12 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $328, %rsp # imm = 0x148 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 32(%rdi), %xmm13 ; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm15 +; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 96(%rdi), %xmm12 ; SSE-NEXT: movdqa 112(%rdi), %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0] @@ -832,141 +831,145 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb 
%xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: packuswb %xmm1, %xmm3 ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: packuswb %xmm11, %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: pandn %xmm1, %xmm9 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = 
xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm5, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm15 ; SSE-NEXT: por %xmm9, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm9 ; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 @@ -977,7 +980,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,5] ; SSE-NEXT: packuswb 
%xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm15 ; SSE-NEXT: pandn %xmm9, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm2 @@ -985,7 +988,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm15, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] @@ -1011,14 +1014,14 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm15, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] @@ -1032,12 +1035,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1049,22 +1051,23 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb 
%xmm0, %xmm0 @@ -1075,14 +1078,14 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: pandn %xmm1, %xmm8 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1103,12 +1106,12 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: # xmm12 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: pandn %xmm1, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm9, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] @@ -1134,7 +1137,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: pandn %xmm9, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -1142,7 +1145,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,1,3] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm14, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] @@ -1161,21 +1164,21 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm9, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: packuswb %xmm7, %xmm9 +; SSE-NEXT: movdqa 
%xmm4, %xmm15 ; SSE-NEXT: pandn %xmm9, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,1,4,5,6,7] @@ -1190,35 +1193,35 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1230,10 +1233,10 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pandn 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -1245,10 +1248,10 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd $255, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1263,7 +1266,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm15, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, (%rax) +; SSE-NEXT: movapd %xmm7, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, (%rax) ; SSE-NEXT: addq $328, %rsp # imm = 0x148 @@ -1961,20 +1964,19 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm2 ; 
SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] @@ -2044,7 +2046,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2072,7 +2074,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2153,19 +2155,19 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,1,1] ; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: pand %xmm9, %xmm15 @@ -2225,9 +2227,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: pand %xmm11, %xmm7 ; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 @@ -2245,9 +2247,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 @@ -2430,9 +2432,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: movdqa %xmm9, %xmm5 @@ -2447,9 +2449,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 @@ -2504,9 +2506,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -2610,8 +2612,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: packuswb %xmm1, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,2] +; SSE-NEXT: packuswb %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,2] ; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -2671,7 +2673,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshuflw $116, (%rsp), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: movdqa %xmm9, %xmm2 @@ -2684,7 +2686,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $116, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 @@ -2747,7 +2749,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 @@ -2813,7 +2815,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movapd %xmm7, (%r9) +; SSE-NEXT: movapd %xmm10, (%r9) ; SSE-NEXT: movapd %xmm6, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm4, (%rax) @@ -3263,14 +3265,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-LABEL: load_i8_stride8_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3283,8 +3284,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} xmm8 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -3294,21 +3294,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm11 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 @@ -3326,10 +3327,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -3338,8 +3338,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpshufb 
%xmm8, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -3348,28 +3349,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3381,7 +3380,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm4 ; 
AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3390,10 +3389,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3 @@ -3402,34 +3401,35 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm11, %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; 
AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -3440,47 +3440,49 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3491,47 +3493,46 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3545,18 +3546,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -3566,22 +3569,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3595,43 +3597,43 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; 
AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3648,38 +3650,38 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; 
AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -3710,184 +3712,193 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FAST-LABEL: load_i8_stride8_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $232, %rsp +; AVX2-FAST-NEXT: subq $248, %rsp ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm8 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm10 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm15 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm11 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd 
{{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, 
%xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-FAST-NEXT: 
vpshufb %xmm1, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd (%rsp), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] @@ -3898,29 +3909,29 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; 
AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] @@ -3928,25 +3939,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = 
[6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -3957,26 +3968,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] @@ -3984,7 +3995,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3993,30 +4004,29 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-FAST-NEXT: addq $232, %rsp +; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $248, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride8_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4029,8 +4039,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -4040,21 +4049,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm12 @@ -4072,10 +4082,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -4084,8 +4093,9 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -4094,28 +4104,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -4127,7 +4135,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, 
%xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4136,10 +4144,10 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm3 @@ -4148,34 +4156,35 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -4186,47 +4195,49 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4237,47 +4248,46 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, 
%xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4291,18 +4301,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -4312,22 +4324,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4341,43 +4352,43 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, 
%xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4394,38 +4405,38 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4460,91 +4471,91 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: 
vpbroadcastd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpmovqb %ymm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpmovqb %ymm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, 
%xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] ; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm6 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm19 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm20 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm10 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm9 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 -; AVX512F-SLOW-NEXT: 
vpshufb %xmm1, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4552,41 +4563,43 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = 
[0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4594,43 +4607,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb 
%xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm26 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512F-SLOW-NEXT: 
vpshufb %xmm0, %xmm15, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4638,41 +4649,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm7 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm25 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm24 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm2 @@ -4680,45 +4691,43 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm13 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 
%xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm2 @@ -4726,43 +4735,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm5 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm25 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm2 @@ -4770,11 +4777,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-SLOW-NEXT: 
vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -4782,24 +4790,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm16, %zmm3 @@ -5588,7 +5594,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $2024, %rsp # imm = 0x7E8 +; SSE-NEXT: subq $2040, %rsp # imm = 0x7F8 ; SSE-NEXT: movdqa 64(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm8 @@ -5599,9 +5605,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 144(%rdi), %xmm10 ; SSE-NEXT: movdqa 160(%rdi), %xmm7 ; SSE-NEXT: movdqa 176(%rdi), %xmm13 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm15 ; SSE-NEXT: movdqa 224(%rdi), %xmm9 ; SSE-NEXT: movdqa 240(%rdi), %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] @@ -5611,8 +5618,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 @@ -5650,8 +5658,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 48(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm8 @@ -5674,10 +5682,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 464(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa 448(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm0 @@ -5739,20 +5747,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm2 @@ -5820,14 +5827,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSE-NEXT: packuswb %xmm15, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: packuswb %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm9, %xmm2 @@ -5849,12 +5856,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm1 @@ -5883,15 +5890,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,2,3] ; SSE-NEXT: movdqa 
%xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -6020,7 +6027,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6031,7 +6038,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6065,19 +6072,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; 
SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -6093,7 +6100,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] @@ -6101,56 +6108,55 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; SSE-NEXT: packuswb %xmm5, %xmm6 ; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[1,1,1,1] ; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: pand %xmm9, %xmm15 ; SSE-NEXT: por %xmm5, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = 
xmm4[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm5 @@ -6337,26 +6343,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm5, %xmm5 ; SSE-NEXT: pand %xmm12, %xmm5 ; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: movdqa %xmm9, %xmm15 ; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: pand %xmm9, %xmm7 ; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6415,7 +6421,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: movdqa %xmm9, %xmm15 ; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -6550,7 +6556,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 @@ -6560,15 +6566,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; 
SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 @@ -6576,7 +6582,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 @@ -6740,14 +6747,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] @@ -6872,15 +6879,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por 
%xmm2, %xmm3 @@ -7019,9 +7026,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -7063,11 +7071,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -7358,8 +7365,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: pand %xmm12, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 @@ -7449,7 +7455,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm4, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm15, 32(%rax) -; SSE-NEXT: movapd %xmm6, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -7464,7 +7471,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movapd %xmm1, 32(%rax) ; SSE-NEXT: movapd %xmm2, 16(%rax) ; SSE-NEXT: movapd %xmm3, (%rax) -; SSE-NEXT: addq $2024, %rsp # imm = 0x7E8 +; SSE-NEXT: addq $2040, %rsp # imm = 0x7F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride8_vf64: @@ -7629,15 +7636,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -7730,11 +7737,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm6 @@ -7861,15 +7868,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -7962,13 +7969,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -8062,13 +8069,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -8329,72 +8336,73 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-LABEL: load_i8_stride8_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $840, %rsp # imm = 0x348 -; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm8 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -8402,118 +8410,117 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; 
AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: 
vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; 
AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, 
%xmm14, %ymm0, %ymm14 @@ -8521,100 +8528,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8623,100 +8631,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb 
%xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; 
AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, 
%xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8724,104 +8733,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 
# 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; 
AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8829,103 +8834,106 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: 
vpshufb %xmm1, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8933,101 +8941,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; 
AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; 
AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
[6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -9036,101 +9043,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; 
AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; 
AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; 
AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -9138,50 +9144,51 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte 
Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm13 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; 
AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) @@ -9231,20 +9238,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm15 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm14 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 @@ -9269,19 +9275,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] @@ -9291,7 +9294,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm9 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9306,10 +9309,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 @@ -9330,7 +9333,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa 
128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9340,55 +9343,53 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 @@ -9396,12 +9397,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -9421,57 +9423,55 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm15 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -9479,49 +9479,49 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12 +; AVX2-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 @@ -9529,62 +9529,61 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; 
AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2,3] @@ -9608,25 +9607,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm5[2,3] @@ -9650,59 +9650,59 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = 
[5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] @@ -9726,61 +9726,64 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 ; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] @@ -9801,62 +9804,64 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpunpcklwd 
{{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm15 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -9900,72 +9905,73 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-LABEL: load_i8_stride8_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $840, %rsp # imm = 0x348 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm13 +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -9973,118 +9979,117 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 208(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 
= ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, 
%xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10092,100 +10097,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, 
%xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10194,100 +10200,101 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10295,104 +10302,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = 
[4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10400,103 +10403,106 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = 
xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10504,101 +10510,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, 
%ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10607,101 +10612,100 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm2, %xmm12, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -10709,50 +10713,51 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd 
{{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) @@ -10793,23 +10798,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm27 +; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm17 ; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, 
%ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] @@ -10821,19 +10828,20 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm19 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm27, %xmm4 +; AVX512F-SLOW-NEXT: vpmovqb %zmm17, %xmm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: movb $-64, %al @@ -10841,18 +10849,18 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512F-SLOW-NEXT: vpmovqb %zmm5, %xmm5 @@ -10865,397 +10873,397 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm28 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-SLOW-NEXT: vpmovqb %zmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; 
AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm27, %zmm7 -; AVX512F-SLOW-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = 
xmm6[0,1,2],xmm5[3] +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm17, %zmm6 +; AVX512F-SLOW-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm19 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm19 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm18 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, 
%xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm21, %zmm2 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = 
[0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm28 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm17, %zmm9 +; 
AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; 
AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm24 -; 
AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm16 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; 
AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte 
Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm18 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -11264,98 +11272,102 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm27 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, 
%xmm9, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm30 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: 
vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm18, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm23 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -11363,86 +11375,88 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, 
%xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm17, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm17 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 -; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm26, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = 
[7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 @@ -11451,73 +11465,72 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 
%xmm28, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm23, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb 
%xmm1, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm26, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11529,307 +11542,298 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, (%rax) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i8_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $408, %rsp # imm = 0x198 +; AVX512F-FAST-NEXT: subq $440, %rsp # imm = 0x1B8 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %ymm31 -; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-FAST-NEXT: vpmovqb %zmm26, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16 ; AVX512F-FAST-NEXT: movb $-64, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, 
%ymm22 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm7 -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm7 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm29 +; AVX512F-FAST-NEXT: vpmovqb %zmm29, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm0 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm5 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512F-FAST-NEXT: vpsrlq $8, %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $8, %zmm28, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm15 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm5 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vpsrlq $8, %zmm28, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $8, %zmm29, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] 
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm26, %zmm5 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm28, %zmm5 ; AVX512F-FAST-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm21 ; AVX512F-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm29 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm5 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm16 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm20 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm28, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm29, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; 
AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm22 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpsrlq $24, %zmm26, %zmm5 +; AVX512F-FAST-NEXT: vpsrlq $24, %zmm28, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 ; AVX512F-FAST-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, 
%xmm1
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm21
+; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm4
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm21
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5
+; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm5
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3
+; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm4
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm16
+; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm6
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512F-FAST-NEXT: vpsrlq $24, %zmm28, %zmm3
+; AVX512F-FAST-NEXT: vpsrlq $24, %zmm29, %zmm3
; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -11837,254 +11841,259 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm4
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm28
; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2
+; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm4
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm0, %ymm2
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm9
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19
+; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm8
-; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm8
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm2
-; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm8
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm18
-; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm10
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm24
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm10
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm22
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm12
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm16
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
-; AVX512F-FAST-NEXT: vpsrlq $32, %zmm26, %zmm10
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm10
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm9
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm20
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm10
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm22
+; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
+; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm10
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm23
+; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm13
+; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512F-FAST-NEXT: vpsrlq $32, %zmm24, %zmm10
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm25
; AVX512F-FAST-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
-; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm12
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm13
+; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 {%k1}
; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm13
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1
; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm1[7]
-; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm15
-; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm14
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm9
-; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm14
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm7
-; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm14
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm20
-; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm11
+; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2
+; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm15
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm31
+; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm14
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7]
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm10
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm14
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm11
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm24
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3]
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm19
-; AVX512F-FAST-NEXT: vpsrlq $32, %zmm23, %zmm11
-; AVX512F-FAST-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm0
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm8
+; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm14
+; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm12
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm21
+; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
+; AVX512F-FAST-NEXT: vpsrlq $32, %zmm29, %zmm12
+; AVX512F-FAST-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm11
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0
-; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm12
+; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm11
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm16
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm11
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm7
+; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm12
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm5
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm10
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm2
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm12
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm19
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm4
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm8
-; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm15
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm9
+; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14
+; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512F-FAST-NEXT: vpsrlq $40, %zmm26, %zmm14
+; AVX512F-FAST-NEXT: vpsrlq $40, %zmm25, %zmm14
; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 {%k1}
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm6
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm14
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14
+; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1
+; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm2
+; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm14
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm3
+; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm14
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3
; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm15
+; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm15
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7]
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm14
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm21
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm11
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm23
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm14
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm20
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm11
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9
-; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14
-; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm13
+; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm14
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm22
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm2
+; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512F-FAST-NEXT: vpsrlq $40, %zmm19, %zmm13
+; AVX512F-FAST-NEXT: vpsrlq $40, %zmm29, %zmm13
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm18
; AVX512F-FAST-NEXT: vpmovqb %zmm13, %xmm13
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm31
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm20
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm11
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm4
-; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm11
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm12
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm12
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm29
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm10
+; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm1
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6
+; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm11
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm11
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm12
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm16
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm28
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm23
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm24
-; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm15
+; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm28
+; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm16
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512F-FAST-NEXT: vpsrlq $48, %zmm26, %zmm14
+; AVX512F-FAST-NEXT: vpsrlq $48, %zmm25, %zmm14
; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 {%k1}
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm10
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm6
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm14
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm2
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm14
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0
-; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm15
+; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm0
+; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm9
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm3
+; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm14
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm3
+; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm14
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3
+; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm15
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm15
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm14
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0
-; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm11
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm8
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm14
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm15
+; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm11
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14
-; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm13
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm3
+; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14
+; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512F-FAST-NEXT: vpsrlq $48, %zmm19, %zmm13
+; AVX512F-FAST-NEXT: vpsrlq $48, %zmm18, %zmm13
; AVX512F-FAST-NEXT: vpmovqb %zmm13, %xmm13
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm17
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm5
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm21
; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
+; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm6
; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2
-; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm4
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm5
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm11
-; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11
-; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm12
+; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2
+; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm4
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm2
+; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm2
+; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm6
+; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm2
+; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm11
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2
+; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm12
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
-; AVX512F-FAST-NEXT: vpsrlq $56, %zmm26, %zmm11
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3]
+; AVX512F-FAST-NEXT: vpsrlq $56, %zmm25, %zmm11
; AVX512F-FAST-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
-; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1}
-; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2
-; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm8
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7]
-; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm1
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3]
+; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5
+; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 {%k1}
+; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm3
+; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm9
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7]
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0
+; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0
-; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm1
-; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm2
-; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm2
-; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm3
+; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2
+; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512F-FAST-NEXT: vpsrlq $56, %zmm19, %zmm2
-; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1
+; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm3
+; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm1
+; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm4
+; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; AVX512F-FAST-NEXT: vpsrlq $56, %zmm18, %zmm3
+; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512F-FAST-NEXT: vmovaps %zmm1, (%rsi)
; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
@@ -12096,674 +12105,679 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512F-FAST-NEXT: vmovaps %zmm1, (%r9)
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, (%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, (%rax)
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, (%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rax)
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512F-FAST-NEXT: addq $408, %rsp # imm = 0x198
+; AVX512F-FAST-NEXT: addq $440, %rsp # imm = 0x1B8
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i8_stride8_vf64:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: subq $744, %rsp # imm = 0x2E8
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5
; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1
; AVX512BW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2
; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm5
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm4
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, %xmm7
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm4
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm3
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24
; AVX512BW-SLOW-NEXT: vmovdqa 480(%rdi), %xmm6
-; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm5
+; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm6
; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm5
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24
-; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6
-; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm8
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm7, %xmm6
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm30
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5
-; AVX512BW-SLOW-NEXT: vpmovqb %ymm5, %xmm5
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4
+; AVX512BW-SLOW-NEXT: vpmovqb %ymm4, %xmm4
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX512BW-SLOW-NEXT: vmovdqa 368(%rdi), %xmm2
-; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm5
+; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm4
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512BW-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2
-; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm8
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm9
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm27
+; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm27, %xmm6
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
; AVX512BW-SLOW-NEXT: vmovdqa 336(%rdi), %xmm2
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm11
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22
; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm12
; AVX512BW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm2
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm13
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512BW-SLOW-NEXT: vpmovqb %zmm1, %xmm12
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm20
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm15
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm9
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
+; AVX512BW-SLOW-NEXT: vpmovqb %zmm1, %xmm11
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
+; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm20
; AVX512BW-SLOW-NEXT: movb $-64, %al
; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm20 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm1
-; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm11
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, %xmm8
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1}
+; AVX512BW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm28
+; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm28, %xmm7
; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1
-; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm13
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, %xmm14
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1
-; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm1, %xmm13
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17
-; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1
-; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm1, %xmm18
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm29
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm18[0],xmm13[0],xmm18[1],xmm13[1],xmm18[2],xmm13[2],xmm18[3],xmm13[3]
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm10
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512BW-SLOW-NEXT: vmovdqa64 208(%rdi), %xmm17
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm17, %xmm10
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm16
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX512BW-SLOW-NEXT: vpmovqb %ymm13, %xmm13
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10
+; AVX512BW-SLOW-NEXT: vpmovqb %ymm10, %xmm10
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7]
; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0
-; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm18
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm16
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm10
; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm15
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm18[0],xmm15[1],xmm18[1],xmm15[2],xmm18[2],xmm15[3],xmm18[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm14
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3]
; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm21
; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm19
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm15
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
+; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm13
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0
-; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm4
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12
-; AVX512BW-SLOW-NEXT: vmovdqa 416(%rdi), %xmm13
-; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm16
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm1
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm25, %xmm2
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm6
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm7
+; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm3
+; AVX512BW-SLOW-NEXT: vmovdqa64 416(%rdi), %xmm20
+; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm29
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm19
-; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm24
-; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm25
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm1
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm13
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12
+; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm24
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm30, %xmm16
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm30, %xmm25
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm16, %xmm24
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, %xmm22
-; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm13, %xmm25
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm13, %xmm18
+; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm24
+; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm20, %xmm25
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm23
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3
+; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2
; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm12, %xmm24
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm28
-; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm4, %xmm25
+; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm3, %xmm24
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21
+; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm25
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm4
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm31, %xmm3
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, %xmm13
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, %xmm7
-; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm4
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm4
-; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm24
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm6, %zmm4
-; AVX512BW-SLOW-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm1
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, %xmm12
-; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm20
-; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm4
-; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm24
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm31
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 176(%rdi), %xmm25
+; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm31, %xmm2
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm27, %xmm3
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm22, %xmm3
+; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm24
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm31
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm15, %zmm3
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22
+; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm9
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm28, %xmm1
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm3
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm17, %xmm3
+; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm24
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
+; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm25, %xmm4
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm6
-; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm0, %xmm24
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3]
-; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm0, %xmm3
+; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm6, %xmm25
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3]
+; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm0, %xmm0
-; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm8, %xmm30
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm6, %xmm30
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3]
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm23, %xmm1
-; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm4
-; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm15, %zmm2
-; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm1
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm3
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm3
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm28, %xmm4
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm5, %zmm3
+; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
-; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3
+; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm19, %xmm2
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm20
+; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm5
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm8
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm25
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm13
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm5
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm23
-; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm30
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm22
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm24
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm29, %xmm13
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm30
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm16
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm28, %xmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm18, %xmm26
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm26[0],xmm0[0],xmm26[1],xmm0[1],xmm26[2],xmm0[2],xmm26[3],xmm0[3]
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, %xmm18
+; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm21, %xmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm17
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm19
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm13, %xmm29
-; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm5
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm27
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm26
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm13, %zmm5
-; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2
+; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm27, %xmm13
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
+; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm13
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm19
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6
+; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm22, %zmm13
+; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm0
-; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, %xmm7
-; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm16, %xmm5
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm20, %xmm5
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm26
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm21, %xmm13
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm13
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm19
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5
-; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm5, %xmm5
-; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm24, %xmm26
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
+; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm13
+; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm31, %xmm13
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm23, %xmm19
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm4
+; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4
; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm4
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm3
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm28
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm28, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9
-; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm15, %zmm3
+; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm5, %zmm3
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28
; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1
-; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm25, %xmm3
+; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4
-; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm4
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, (%rsp) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm19
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3]
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm17, %xmm13
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, %xmm10
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm13
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm13
+; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm30
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3]
+; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm6, %zmm13
+; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3]
+; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm1
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm21, %xmm13
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm13
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm21
+; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm30
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm22
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm13
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm25
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, %xmm9
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm31, %xmm13
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm23, %xmm19
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm3
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm14, %xmm23
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17
+; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm28, %zmm2
+; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
+; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm28
+; AVX512BW-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm13
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26
-; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3]
-; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm10, %xmm30
-; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm2
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm13
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm19
+; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30
+; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm24
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
+; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm2
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3]
-; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-SLOW-NEXT: vpshufb %xmm0,
%xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm13, %zmm5 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm10, %zmm13 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm20, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm27, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm31, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm28, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm9, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm20 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm14, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, 
%xmm19, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm31, %xmm30 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm30[0],xmm3[1],xmm30[1],xmm3[2],xmm30[2],xmm3[3],xmm30[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm13, %zmm5 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm27, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $32, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm15, %xmm19 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm31, %xmm26 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm26 = xmm30[0],xmm26[0],xmm30[1],xmm26[1],xmm30[2],xmm26[2],xmm30[3],xmm26[3] +; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm24, %xmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm26[0],xmm6[0],xmm26[1],xmm6[1],xmm26[2],xmm6[2],xmm26[3],xmm6[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm13, %zmm6 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm26[0],xmm6[0],xmm26[1],xmm6[1],xmm26[2],xmm6[2],xmm26[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm17 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd 
{{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm29, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm27, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm31, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm14, %zmm2 +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm20, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm1 +; 
AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm26 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm31 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm26 = xmm30[0],xmm26[0],xmm30[1],xmm26[1],xmm30[2],xmm26[2],xmm30[3],xmm26[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm19 +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, 
%xmm30, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm13, %zmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm30, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm10, %zmm5 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm17, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm17, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm18 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm27, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm31, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm20, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; 
AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm14, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm19, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm28, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm23, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 
16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm30, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm13, %zmm7 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm24, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm30, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm10, %zmm6 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm17, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm28, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm17, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, 
%ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm7 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm6 ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm29, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm27, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm31, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm20, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm16, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm20, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm23, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm14, %zmm3 +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm19, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -12792,128 +12806,129 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-LABEL: load_i8_stride8_vf64: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: subq $328, %rsp # imm = 0x148 -; AVX512BW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm18 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm30 ; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm30, %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512BW-FAST-NEXT: vmovdqa %ymm2, %ymm11 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm31 ; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm31, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm11 +; AVX512BW-FAST-NEXT: vmovdqa 
%ymm3, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm17 -; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm17, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm19 +; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm19, %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm9 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm28 -; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm0, %ymm14 +; AVX512BW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm29 +; AVX512BW-FAST-NEXT: vpermd %ymm29, %ymm0, %ymm14 ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm3 ; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm10 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FAST-NEXT: vmovdqa64 368(%rdi), %xmm20 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm20, %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 352(%rdi), %xmm19 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm19, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 368(%rdi), %xmm21 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm21, %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa 352(%rdi), %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm24 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FAST-NEXT: vmovdqa64 336(%rdi), %xmm18 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm18, %xmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm29 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm29, %xmm6 +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-FAST-NEXT: vmovdqa 336(%rdi), %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm28 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm28, %xmm6 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-FAST-NEXT: vpmovqb %zmm4, %xmm5 +; AVX512BW-FAST-NEXT: vpmovqb %zmm18, %xmm5 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm21 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20 ; AVX512BW-FAST-NEXT: movb $-64, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm16 -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm16, %ymm1 +; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm17, %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: 
vpermd %ymm2, %ymm0, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm5 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm6 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm6 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm27 -; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm9 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm0, %ymm16 +; AVX512BW-FAST-NEXT: vpshufb %ymm10, %ymm16, %ymm8 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 112(%rdi), %xmm25 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm25, %xmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 96(%rdi), %xmm23 -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm23, %xmm7 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512BW-FAST-NEXT: vmovdqa64 112(%rdi), %xmm26 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm26, %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 96(%rdi), %xmm24 +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm24, %xmm7 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512BW-FAST-NEXT: vmovdqa64 80(%rdi), %xmm22 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm22, %xmm26 -; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm9, %xmm24 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm24[0],xmm26[0],xmm24[1],xmm26[1],xmm24[2],xmm26[2],xmm24[3],xmm26[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm25 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm8, %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3] ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 ; AVX512BW-FAST-NEXT: vpmovqb %zmm10, %xmm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm30, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm31, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm30, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa %ymm9, %ymm11 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: 
vpshufb %ymm9, %ymm31, %ymm13 +; AVX512BW-FAST-NEXT: vmovdqa %ymm9, %ymm6 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm17, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm15 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm19, %ymm13 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm20, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm19, %xmm24 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm24[0],xmm15[0],xmm24[1],xmm15[1],xmm24[2],xmm15[2],xmm24[3],xmm15[3] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm24 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm18, %xmm26 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm29, %xmm21 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm21[0],xmm26[0],xmm21[1],xmm26[1],xmm21[2],xmm26[2],xmm21[3],xmm26[3] +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm21, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-FAST-NEXT: vmovdqa %xmm12, %xmm7 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm25 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm28, %xmm20 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm20[0],xmm25[0],xmm20[1],xmm25[1],xmm20[2],xmm25[2],xmm20[3],xmm25[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] -; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm18, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm16, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm17, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm15 -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm16, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm25, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm23, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb 
%xmm13, %xmm26, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm24, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm22, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm9, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm8, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] ; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm10, %zmm13 @@ -12922,95 +12937,97 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm31, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm30, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm31, %ymm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm17, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm8 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm12 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm19, %ymm11 +; AVX512BW-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa %xmm4, %xmm1 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; 
AVX512BW-FAST-NEXT: vmovdqa64 %xmm18, %xmm26 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm18, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm21 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm21[0],xmm15[0],xmm21[1],xmm15[1],xmm21[2],xmm15[2],xmm21[3],xmm15[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm7, %xmm23 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm28, %xmm20 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm20[0],xmm15[0],xmm20[1],xmm15[1],xmm20[2],xmm15[2],xmm20[3],xmm15[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm18, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm17, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm3, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm15 -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm25, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: 
vpshufb %ymm5, %ymm31, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm30, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm31, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm17, %ymm8 +; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm19, %ymm9 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm20, %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm19, %xmm24 -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm19, %xmm12 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm12 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm18, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm29, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm14 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm28, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm4, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm18, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm25 ; AVX512BW-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm17, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm16, %ymm1 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; 
AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm25, %xmm1 -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm23, %xmm2 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm26, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm24, %xmm2 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm22, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm10, %zmm2 @@ -13020,183 +13037,184 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] -; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpermd (%rsp), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm2 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm3, %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm2 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vpermd %ymm29, %ymm3, %ymm14 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm11 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 %xmm20, %xmm16 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm19, %xmm16 
+; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %xmm20, %xmm17 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm18, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm29, %xmm28 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm23, %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm28, %xmm29 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm17, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm18, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm12 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm12 {%k1} ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm19 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm19, %ymm8 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm19, %ymm9 ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm18 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm18, %ymm15 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5,6],ymm8[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm18, %ymm15 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7] ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm3, %ymm20 +; AVX512BW-FAST-NEXT: vpermd %ymm27, %ymm3, %ymm21 ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm20, %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm21, %ymm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm25, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm22, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm9, %xmm13 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm22, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm8, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FAST-NEXT: vpblendd 
{{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm29 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm8 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm9 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm11 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm11 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm16, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm17, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm26, %xmm21 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm27 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm23, %xmm20 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm28, %xmm27 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-FAST-NEXT: vpsrlq $40, %zmm17, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-FAST-NEXT: vpsrlq $40, %zmm25, %zmm12 ; AVX512BW-FAST-NEXT: vpmovqb %zmm12, %xmm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm19, %ymm8 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm18, %ymm12 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm20, %ymm4 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 {%k1} +; 
AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm19, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm18, %ymm12 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm21, %ymm4 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm25, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm12 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $40, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm21 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm20 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm3 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm5, %ymm4 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5],ymm8[6,7] +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm4 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm4 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm14, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = 
[6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm16, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm24, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm16, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm17, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm29, %xmm13 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm23, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm28, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] ; AVX512BW-FAST-NEXT: vpsrlq $48, %zmm27, %zmm12 ; AVX512BW-FAST-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 {%k1} ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm19, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm18, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm18, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm20, %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm21, %ymm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm25, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm23, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm26, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm24, %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm22, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] ; AVX512BW-FAST-NEXT: vpsrlq $48, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm8, %zmm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm0 ; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm0 
-; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm2, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm5, %ymm1 ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm16, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm24, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm17, %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm26, %xmm8 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm29, %xmm11 -; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3] -; AVX512BW-FAST-NEXT: vpsrlq $56, %zmm27, %zmm8 -; AVX512BW-FAST-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm23, %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm28, %xmm11 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX512BW-FAST-NEXT: vpsrlq $56, %zmm27, %zmm9 +; AVX512BW-FAST-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm19, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm18, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm15, %ymm8 -; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm20, %ymm11 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm5 -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm23, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm18, %ymm9 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm15, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm21, %ymm11 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm26, %xmm5 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm24, %xmm1 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm22, %xmm5 -; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX512BW-FAST-NEXT: vpsrlq $56, %zmm10, %zmm4 @@ -13212,9 +13230,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovaps %zmm1, (%rcx) ; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FAST-NEXT: vmovaps %zmm1, (%r8) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, (%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, (%r9) ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 8f160e2bafda0..3cc97d69e8b03 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -1042,75 +1042,75 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = 
xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm7, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; 
AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm5, 128(%rcx) +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 160(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1119,75 +1119,75 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = 
[255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa %ymm3, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1308,18 +1308,17 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $360, %rsp # imm = 0x168 +; SSE-NEXT: subq $328, %rsp # imm = 0x148 ; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm5 ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa 16(%rdx), %xmm9 ; SSE-NEXT: movdqa 32(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1327,7 +1326,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] @@ -1336,7 +1335,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 @@ -1348,225 +1347,223 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] ; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 32(%rsi), %xmm8 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 
-; SSE-NEXT: movdqa 48(%rsi), %xmm15 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 64(%rsi), %xmm12 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa 64(%rsi), %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa 
%xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa 80(%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa 80(%rsi), %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 96(%rdi), %xmm2 -; SSE-NEXT: movdqa 96(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: movdqa 96(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 
= xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm13 ; SSE-NEXT: movdqa 112(%rdx), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa 112(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 112(%rsi), %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; SSE-NEXT: pandn %xmm1, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = 
xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1575,71 +1572,71 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd $250, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm14[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm12, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, 368(%rcx) -; SSE-NEXT: movdqa %xmm2, 320(%rcx) +; SSE-NEXT: movdqa %xmm3, 320(%rcx) ; SSE-NEXT: movdqa %xmm1, 272(%rcx) -; SSE-NEXT: movdqa %xmm12, 224(%rcx) -; SSE-NEXT: movdqa %xmm15, 176(%rcx) -; SSE-NEXT: movdqa %xmm3, 128(%rcx) -; SSE-NEXT: movdqa %xmm4, 80(%rcx) -; SSE-NEXT: movdqa %xmm5, 32(%rcx) +; SSE-NEXT: movdqa 
%xmm14, 224(%rcx) +; SSE-NEXT: movdqa %xmm2, 176(%rcx) +; SSE-NEXT: movdqa %xmm4, 128(%rcx) +; SSE-NEXT: movdqa %xmm5, 80(%rcx) +; SSE-NEXT: movdqa %xmm6, 32(%rcx) ; SSE-NEXT: movdqa %xmm11, 352(%rcx) -; SSE-NEXT: movdqa %xmm7, 336(%rcx) -; SSE-NEXT: movdqa %xmm9, 304(%rcx) -; SSE-NEXT: movdqa %xmm13, 288(%rcx) -; SSE-NEXT: movdqa %xmm14, 256(%rcx) +; SSE-NEXT: movdqa %xmm10, 336(%rcx) +; SSE-NEXT: movdqa %xmm13, 304(%rcx) +; SSE-NEXT: movdqa %xmm15, 288(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1662,27 +1659,27 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: addq $360, %rsp # imm = 0x168 +; SSE-NEXT: addq $328, %rsp # imm = 0x148 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: subq $280, %rsp # imm = 0x118 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm14 @@ -1701,38 +1698,38 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 @@ -1740,7 +1737,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 @@ -1749,7 +1746,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 @@ -1761,104 +1758,107 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3],xmm15[4],xmm1[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] ; AVX1-ONLY-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm4[2],xmm8[3,4],xmm4[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2],xmm4[3,4],xmm14[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = 
xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm10[2],xmm14[3,4],xmm10[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm5[0,1],xmm11[2],xmm5[3,4],xmm11[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm13[2],xmm9[3,4],xmm13[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 288(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 368(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 288(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 368(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 320(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 336(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 336(%rcx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1867,8 +1867,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rcx) -; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 240(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1889,30 +1888,30 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rcx) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: addq $280, %rsp # imm = 0x118 ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride3_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 80(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,5,u,6,6,u,7,7> ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] @@ -1923,8 +1922,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 112(%rsi), %xmm10 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] @@ -1935,8 +1934,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 @@ -1948,7 +1947,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7 -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[3,3,3,3,4,5,6,7] @@ -1959,7 +1958,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 @@ -1977,58 +1976,58 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2],xmm4[3,4],xmm10[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2],xmm2[3,4],xmm10[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3,4],xmm14[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, 
%ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm14 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm12, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm13, %ymm13 ; AVX2-SLOW-NEXT: vpermd 64(%rdi), %ymm12, %ymm15 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm15 ; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm12, %ymm6 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm2 ; AVX2-SLOW-NEXT: vpermd 96(%rdi), %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,3,3,u,4,4,u> -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm3 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 320(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 224(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 288(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 288(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, 96(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm9, 192(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) @@ -2044,137 +2043,138 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride3_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; 
AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> 
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm7, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, 
%ymm0 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 ; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm5, %ymm13 -; AVX2-FAST-NEXT: 
vpblendvb %ymm11, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm3 -; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm3, 320(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 128(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 288(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 352(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 96(%rcx) +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 +; 
AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 320(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 352(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm7, 160(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) @@ -2187,137 +2187,138 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, 
%xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, 
%xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm12, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 112(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm12, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm5, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 320(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 224(%rcx) -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 352(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 320(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 352(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll index bc25bb39f9691..6cd1f13398c60 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -1355,12 +1355,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa 48(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] ; SSE-NEXT: movdqa %xmm13, %xmm0 @@ -1369,98 +1370,97 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm0 -; SSE-NEXT: movdqa 64(%rcx), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa 64(%rsi), %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa 64(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: movdqa 80(%rdx), %xmm0 -; SSE-NEXT: movdqa 80(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movdqa 80(%rdi), %xmm5 +; SSE-NEXT: movdqa 80(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa 80(%rdi), %xmm11 ; SSE-NEXT: movdqa 80(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 96(%rdx), %xmm1 -; SSE-NEXT: movdqa 96(%rcx), %xmm4 +; SSE-NEXT: movdqa 96(%rcx), %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa 96(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; SSE-NEXT: 
punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rsi), %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movdqa 112(%rdx), %xmm2 -; SSE-NEXT: movdqa 112(%rcx), %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: movdqa 112(%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa 112(%rsi), %xmm8 +; SSE-NEXT: movdqa 112(%rsi), %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, 496(%r8) -; SSE-NEXT: movdqa %xmm7, 480(%r8) +; SSE-NEXT: movdqa %xmm4, 480(%r8) ; SSE-NEXT: movdqa %xmm1, 464(%r8) -; SSE-NEXT: movdqa %xmm4, 448(%r8) -; SSE-NEXT: movdqa %xmm3, 432(%r8) -; SSE-NEXT: movdqa %xmm11, 416(%r8) -; SSE-NEXT: movdqa %xmm6, 400(%r8) -; SSE-NEXT: movdqa %xmm13, 384(%r8) -; SSE-NEXT: movdqa %xmm5, 368(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%r8) -; SSE-NEXT: movdqa %xmm15, 336(%r8) +; SSE-NEXT: movdqa %xmm3, 448(%r8) +; SSE-NEXT: movdqa %xmm5, 432(%r8) +; SSE-NEXT: movdqa %xmm10, 416(%r8) +; SSE-NEXT: movdqa %xmm9, 400(%r8) +; SSE-NEXT: movdqa %xmm12, 384(%r8) +; SSE-NEXT: movdqa %xmm11, 368(%r8) +; SSE-NEXT: movdqa %xmm15, 352(%r8) +; SSE-NEXT: movdqa %xmm8, 336(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%r8) -; SSE-NEXT: movdqa %xmm10, 304(%r8) +; SSE-NEXT: movdqa %xmm13, 304(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%r8) -; SSE-NEXT: movdqa %xmm12, 272(%r8) +; SSE-NEXT: movdqa %xmm14, 272(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movdqa %xmm14, 208(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 92acf21cad010..2670cc353aa17 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -174,20 +174,21 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] ; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: por %xmm4, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,0,65535] ; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; SSE-NEXT: pandn %xmm6, %xmm4 ; SSE-NEXT: por %xmm7, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] @@ -200,14 +201,14 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: por %xmm7, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: 
pand %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movq %xmm3, 32(%r9) ; SSE-NEXT: movdqa %xmm2, (%r9) ; SSE-NEXT: movdqa %xmm4, 16(%r9) @@ -410,18 +411,18 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm5 ; SSE-NEXT: movdqa (%rsi), %xmm7 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa (%rcx), %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,0] @@ -432,18 +433,18 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm10, %xmm11 ; SSE-NEXT: por %xmm9, %xmm11 ; SSE-NEXT: pand %xmm8, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 ; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 ; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,1] @@ -454,8 +455,8 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] ; SSE-NEXT: pandn %xmm8, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; SSE-NEXT: psrlq $48, %xmm7 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm7[1] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] @@ -472,15 +473,15 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm10 = 
zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] +; SSE-NEXT: pand %xmm7, %xmm5 ; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] @@ -493,10 +494,10 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm8, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, 16(%r9) -; SSE-NEXT: movdqa %xmm4, 48(%r9) +; SSE-NEXT: movdqa %xmm5, 48(%r9) ; SSE-NEXT: movdqa %xmm1, 64(%r9) ; SSE-NEXT: movdqa %xmm0, (%r9) -; SSE-NEXT: movdqa %xmm5, 32(%r9) +; SSE-NEXT: movdqa %xmm4, 32(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf8: @@ -799,161 +800,157 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm15 ; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm15 -; SSE-NEXT: movdqa 16(%rsi), %xmm13 +; SSE-NEXT: movdqa (%rsi), %xmm8 +; SSE-NEXT: movdqa 16(%rsi), %xmm0 ; SSE-NEXT: movdqa 16(%rdx), %xmm10 ; SSE-NEXT: movdqa (%rcx), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rcx), %xmm11 -; SSE-NEXT: movdqa 16(%r8), %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa 16(%r8), %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm12 
-; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: por %xmm6, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm15[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm15, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm7, %xmm13 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm9, %xmm14 ; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,2,2] ; SSE-NEXT: pand %xmm9, %xmm15 ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: movdqa (%r8), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm12, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE-NEXT: psrlq $48, %xmm13 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm13[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] -; SSE-NEXT: pand %xmm2, 
%xmm15 -; SSE-NEXT: por %xmm13, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm12, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm14[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm15, %xmm12 +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; 
SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm11, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,7,6] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm11, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm2, %xmm8 ; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3] -; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: pandn %xmm2, %xmm9 ; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: 
pand %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm3, %xmm1 @@ -962,10 +959,10 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] @@ -979,11 +976,11 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm10, %xmm1 ; SSE-NEXT: movdqa %xmm1, (%r9) -; SSE-NEXT: movdqa %xmm12, 16(%r9) +; SSE-NEXT: movdqa %xmm13, 16(%r9) ; SSE-NEXT: movdqa %xmm15, 48(%r9) ; SSE-NEXT: movdqa %xmm9, 64(%r9) ; SSE-NEXT: movdqa %xmm7, 80(%r9) -; SSE-NEXT: movdqa %xmm13, 96(%r9) +; SSE-NEXT: movdqa %xmm12, 96(%r9) ; SSE-NEXT: movdqa %xmm14, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) @@ -995,45 +992,45 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] -; 
AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm2[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2,3,4],xmm7[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm14 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 @@ -1043,79 +1040,79 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = 
xmm13[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1],xmm13[2],xmm10[3,4,5,6],xmm13[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm13[2],xmm8[3,4,5,6],xmm13[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm9 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm11[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm11[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm15, 
%ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3,4,5],xmm13[6],xmm11[7] +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm12[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4],xmm2[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm3[2],xmm6[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm11[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9) -; 
AVX1-ONLY-NEXT: vmovdqa %xmm11, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 96(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 80(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 128(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-ONLY-NEXT: vzeroupper @@ -1562,147 +1559,151 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $232, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa 16(%rsi), %xmm13 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm0 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 +; SSE-NEXT: subq $248, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%rcx), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = 
xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm11, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm7 ; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa 16(%rdx), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: movdqa 16(%rdx), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm11 ; SSE-NEXT: por %xmm11, %xmm7 ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: movdqa 16(%r8), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movdqa 16(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: movdqa 32(%rsi), %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm7 ; SSE-NEXT: movdqa 32(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: movdqa 32(%rdx), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,2,2] -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; 
SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: movdqa 32(%rdx), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm8 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm12 ; SSE-NEXT: movdqa 32(%r8), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: por %xmm12, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: movdqa 48(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: movdqa 48(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm13, %xmm12 ; SSE-NEXT: pandn %xmm7, %xmm12 ; SSE-NEXT: movdqa 48(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm7 ; SSE-NEXT: por %xmm7, %xmm12 ; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: movdqa 48(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] ; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm15, 
%xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] @@ -1713,244 +1714,244 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm12 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm12 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw 
{{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm13 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm13 -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm13[0,1,1,1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,1,0,1] -; SSE-NEXT: pandn %xmm6, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; 
SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: por %xmm11, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pand %xmm14, %xmm10 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,1,3,2,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] ; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm12, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; 
SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm14, 304(%r9) -; SSE-NEXT: movdqa %xmm15, 288(%r9) -; SSE-NEXT: movdqa %xmm2, 256(%r9) +; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm5[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, 
304(%r9) +; SSE-NEXT: movdqa %xmm10, 288(%r9) +; SSE-NEXT: movdqa %xmm1, 256(%r9) +; SSE-NEXT: movdqa %xmm15, 240(%r9) +; SSE-NEXT: movdqa %xmm14, 224(%r9) +; SSE-NEXT: movdqa %xmm9, 208(%r9) +; SSE-NEXT: movdqa %xmm8, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%r9) -; SSE-NEXT: movdqa %xmm9, 224(%r9) -; SSE-NEXT: movdqa %xmm7, 208(%r9) -; SSE-NEXT: movdqa %xmm11, 176(%r9) -; SSE-NEXT: movdqa %xmm13, 160(%r9) +; SSE-NEXT: movaps %xmm0, 160(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1975,134 +1976,134 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) -; SSE-NEXT: addq $232, %rsp +; SSE-NEXT: addq $248, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $72, %rsp +; AVX1-ONLY-NEXT: subq $56, %rsp ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm7 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm5[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm4[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2,3,4],xmm7[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1,2,3,4],xmm7[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,2,2] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = 
xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm15[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm15[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm15[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4],xmm3[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; 
AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -2111,22 +2112,21 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm1[1],xmm3[1] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm7[2],xmm4[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3,4],xmm7[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm2[1,2,3,4],xmm7[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm2 @@ -2135,18 +2135,18 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 
-; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] @@ -2155,46 +2155,48 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = 
[65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm14, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm9[4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm12[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] @@ -2208,21 +2210,20 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm5[2],xmm12[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1,2,3,4],xmm5[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2,3,4],xmm5[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 64(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 128(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2241,32 +2242,32 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) -; AVX1-ONLY-NEXT: addq $72, %rsp +; AVX1-ONLY-NEXT: addq $56, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $72, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -2274,146 +2275,148 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = 
xmm12[0],xmm13[1],xmm12[2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3],xmm10[4,5],xmm6[6],xmm10[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2],ymm9[3,4],ymm14[5,6,7,8],ymm9[9],ymm14[10],ymm9[11,12],ymm14[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm15, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2],ymm4[3,4],ymm15[5,6,7,8],ymm4[9],ymm15[10],ymm4[11,12],ymm15[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm4, %ymm9 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm12 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5],ymm15[6],ymm4[7,8],ymm15[9],ymm4[10,11],ymm15[12],ymm4[13],ymm15[14],ymm4[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3,4],ymm14[5,6,7,8],ymm13[9],ymm14[10],ymm13[11,12],ymm14[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3,4],ymm0[5,6,7,8],ymm5[9],ymm0[10],ymm5[11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm12 +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10],ymm6[11],ymm11[12,13],ymm6[14],ymm11[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm3, 
%ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 256(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 288(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 256(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -2426,158 +2429,155 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $72, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: subq $40, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), 
%xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5],xmm13[6],xmm6[7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5],xmm13[6],xmm7[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; 
AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3,4],ymm12[5,6,7,8],ymm5[9],ymm12[10],ymm5[11,12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm12 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = 
<255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3],ymm14[4],ymm5[5,6],ymm14[7],ymm5[8,9],ymm14[10],ymm5[11],ymm14[12],ymm5[13,14],ymm14[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm7[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
<255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) @@ -2586,172 +2586,168 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 160(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $72, %rsp +; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5],xmm13[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5],xmm13[6],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw 
{{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3,4],ymm12[5,6,7,8],ymm5[9],ymm12[10],ymm5[11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm6 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3],ymm14[4],ymm5[5,6],ymm14[7],ymm5[8,9],ymm14[10],ymm5[11],ymm14[12],ymm5[13,14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm2 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; 
AVX2-FAST-PERLANE-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) @@ -2760,15 +2756,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2940,63 +2935,63 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm12[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm2 -; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm16 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm14[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm2, %ymm7, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; 
AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] ; AVX512F-FAST-NEXT: vprolq $16, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm0 @@ -3004,80 +2999,80 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[1,1,1,2,5,5,5,6] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm19 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm12 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm12 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,2,3,3,7,6,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3,4],ymm2[5,6,7,8],ymm5[9],ymm2[10],ymm5[11,12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm10 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[2,3,2,3,6,7,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm15[1,1,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm5[1,1,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm12[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm15[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,2,1,4,5,6,5] ; AVX512F-FAST-NEXT: vprolq $16, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3],ymm14[4],ymm8[5,6],ymm14[7],ymm8[8,9],ymm14[10],ymm8[11],ymm14[12],ymm8[13,14],ymm14[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3],ymm5[4],ymm8[5,6],ymm5[7],ymm8[8,9],ymm5[10],ymm8[11],ymm5[12],ymm8[13,14],ymm5[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm6 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,1,1,2,5,5,5,6] 
+; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm8 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1],ymm14[2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm13, %ymm15, %ymm13 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm13 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm13[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm19, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm17[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm7 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm19 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = 
zmm17[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm13, %zmm11 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm13, %zmm0 +; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm3 +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 @@ -3085,12 +3080,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm4 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm2, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 192(%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -3169,36 +3164,35 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i16_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $616, %rsp # imm = 0x268 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm14 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm15 +; SSE-NEXT: movdqa (%rsi), %xmm12 ; SSE-NEXT: movdqa 16(%rsi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm7 -; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm13 +; SSE-NEXT: movdqa (%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} 
xmm3 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%r8), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: por %xmm3, %xmm8 ; SSE-NEXT: pand %xmm1, %xmm8 @@ -3206,24 +3200,24 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 ; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] +; SSE-NEXT: pand %xmm10, %xmm8 ; SSE-NEXT: por %xmm8, %xmm6 ; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: por %xmm3, %xmm6 @@ -3236,13 +3230,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 32(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm6 ; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 @@ -3250,11 +3244,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw 
{{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa 32(%rdx), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: movdqa 32(%rdx), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,2,2] +; SSE-NEXT: pand %xmm10, %xmm11 ; SSE-NEXT: por %xmm11, %xmm8 ; SSE-NEXT: pand %xmm1, %xmm8 ; SSE-NEXT: por %xmm3, %xmm8 @@ -3267,13 +3261,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 48(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: por %xmm3, %xmm8 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm8, %xmm3 @@ -3281,12 +3275,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pandn %xmm8, %xmm11 ; SSE-NEXT: movdqa 48(%rdx), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: por %xmm13, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: por %xmm15, %xmm11 ; SSE-NEXT: pand %xmm1, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 ; SSE-NEXT: pand %xmm2, %xmm11 @@ -3298,29 +3292,29 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 ; SSE-NEXT: movdqa 64(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pandn %xmm3, %xmm11 ; SSE-NEXT: movdqa 64(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: por %xmm13, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 ; SSE-NEXT: pand %xmm2, %xmm11 ; SSE-NEXT: movdqa 
64(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill @@ -3330,13 +3324,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: movdqa 80(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 @@ -3344,31 +3338,31 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 ; SSE-NEXT: movdqa 80(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm2, %xmm15 ; SSE-NEXT: movdqa 80(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: movdqa 96(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 @@ -3376,46 +3370,46 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 ; SSE-NEXT: movdqa 96(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: pand 
%xmm1, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm2, %xmm15 ; SSE-NEXT: movdqa 96(%r8), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: movdqa 112(%rsi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: pand %xmm9, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movdqa 112(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa 112(%r8), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3423,417 +3417,419 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,0,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,1,1] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,1,1] +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: movdqa %xmm9, 
%xmm13 +; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm12[1] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: 
pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,0,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] ; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; 
SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm7 ; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte 
Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: 
movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,3,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 ; SSE-NEXT: por %xmm5, %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; 
SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,3,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 ; SSE-NEXT: por %xmm5, %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = 
xmm5[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,1] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pand %xmm9, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm8 ; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -3842,67 +3838,67 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,1,1] -; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: pand %xmm4, %xmm14 ; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: por %xmm14, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd 
{{.*#+}} xmm8 = xmm8[2,3,2,3] ; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: pandn %xmm1, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5,7,6] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 ; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -3911,81 +3907,80 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,1,1] +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm5[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = 
[0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm6[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm9, 624(%r9) -; SSE-NEXT: movdqa %xmm13, 608(%r9) -; SSE-NEXT: movdqa %xmm0, 576(%r9) -; SSE-NEXT: movdqa %xmm10, 560(%r9) -; SSE-NEXT: movdqa %xmm15, 544(%r9) +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm10, 624(%r9) +; SSE-NEXT: movdqa %xmm0, 608(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 576(%r9) +; SSE-NEXT: movdqa %xmm2, 560(%r9) +; SSE-NEXT: movdqa %xmm12, 544(%r9) ; SSE-NEXT: movdqa %xmm11, 528(%r9) ; SSE-NEXT: movdqa %xmm14, 496(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 480(%r9) +; SSE-NEXT: movdqa %xmm15, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%r9) @@ -4045,46 +4040,46 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i16_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm10[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm9 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 @@ -4092,79 +4087,79 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 ; AVX1-ONLY-NEXT: vextractf128 
$1, %ymm7, %xmm13 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1],xmm11[2],xmm10[3,4,5,6],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm14[1],xmm2[2,3,4,5],xmm14[6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm4[0],xmm14[1],xmm4[2,3,4,5],xmm14[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm8 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm8[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm4[2],xmm7[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4],xmm4[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4],xmm5[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 @@ -4174,21 +4169,21 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = 
xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] @@ -4202,8 +4197,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -4211,63 +4206,63 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm9[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: 
vandps %ymm10, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm13[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2,3,4],xmm7[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] @@ -4276,7 +4271,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] @@ -4292,21 +4287,21 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] @@ -4320,72 +4315,72 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm6[0],xmm4[1,2,3,4],xmm6[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm11[1],xmm13[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufhw 
{{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2,3,4],xmm7[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm5[2],xmm0[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] @@ -4402,9 
+4397,9 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] @@ -4420,8 +4415,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -4434,30 +4429,30 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: 
vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -4465,56 +4460,56 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 
(%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3],xmm7[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm10[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm10[4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm6[2],xmm12[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4],xmm6[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm5[2],xmm12[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 112(%r9) @@ -4541,9 +4536,9 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 624(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 608(%r9) -; 
AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) @@ -4592,346 +4587,345 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i16_stride5_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd 
{{.*#+}} xmm7 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm7 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm15, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm11 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm6[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; 
AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3],ymm14[4],ymm0[5,6],ymm14[7],ymm0[8,9],ymm14[10],ymm0[11],ymm14[12],ymm0[13,14],ymm14[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = 
ymm9[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3,4],ymm3[5,6,7,8],ymm8[9],ymm3[10],ymm8[11,12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10],ymm12[11],ymm3[12,13],ymm12[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3,4],ymm12[5,6,7,8],ymm14[9],ymm12[10],ymm14[11,12],ymm12[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm15[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2],ymm15[3,4],ymm14[5,6,7,8],ymm15[9],ymm14[10],ymm15[11,12],ymm14[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm12, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3,4],ymm5[5,6,7,8],ymm9[9],ymm5[10],ymm9[11,12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10],ymm9[11],ymm5[12,13],ymm9[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3,4],ymm15[5,6,7,8],ymm0[9],ymm15[10],ymm0[11,12],ymm15[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm14 -; 
AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm11[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5],ymm0[6],ymm8[7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13],ymm0[14],ymm8[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10],ymm12[11],ymm3[12,13],ymm12[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm10, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2],ymm14[3,4],ymm9[5,6,7,8],ymm14[9],ymm9[10],ymm14[11,12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13],ymm9[14],ymm15[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm9, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm13[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[3,0,3,0,7,4,7,4] +; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm7, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 544(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 384(%r9) +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 544(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 384(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 608(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%r9) @@ -4949,8 +4943,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm13, 128(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4970,344 +4963,337 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i16_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: 
vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; 
AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm13, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 64(%r8), %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 96(%r8), %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = 
[255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 64(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 96(%r8), %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm3 +; AVX2-FAST-NEXT: 
vpbroadcastq 104(%rdi), %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5],xmm11[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vmovdqu %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3],ymm12[4],ymm3[5,6],ymm12[7],ymm3[8,9],ymm12[10],ymm3[11],ymm12[12],ymm3[13,14],ymm12[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 80(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq 80(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm13 ; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3,4],ymm15[5,6,7,8],ymm14[9],ymm15[10],ymm14[11,12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3,4],ymm14[5,6,7,8],ymm11[9],ymm14[10],ymm11[11,12],ymm14[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm13 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm13 +; AVX2-FAST-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm15 +; 
AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3,4],ymm15[5,6,7,8],ymm9[9],ymm15[10],ymm9[11,12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3,4],ymm14[5,6,7,8],ymm10[9],ymm14[10],ymm10[11,12],ymm14[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3,4],ymm7[5,6,7,8],ymm12[9],ymm7[10],ymm12[11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 88(%r8), %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 120(%r8), %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = 
[0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 88(%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 120(%r8), %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm11 ; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10,11],ymm15[12],ymm0[13],ymm15[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7,8],ymm12[9],ymm1[10],ymm12[11],ymm1[12,13],ymm12[14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm13[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 
%ymm2, 544(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 384(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 608(%r9) +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm10, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 544(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 384(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 608(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5320,7 +5306,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 512(%r9) @@ -5345,344 +5331,337 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = 
[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb 
%ymm3, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm13, %ymm15, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm14, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 64(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm15, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 96(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 64(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 96(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 104(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5],xmm11[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm8, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 104(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] 
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm14, %ymm2, %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3],ymm12[4],ymm3[5,6],ymm12[7],ymm3[8,9],ymm12[10],ymm3[11],ymm12[12],ymm3[13,14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = 
[255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3,4],ymm15[5,6,7,8],ymm14[9],ymm15[10],ymm14[11,12],ymm15[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3,4],ymm14[5,6,7,8],ymm11[9],ymm14[10],ymm11[11,12],ymm14[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2],ymm11[3,4],ymm15[5,6,7,8],ymm11[9],ymm15[10],ymm11[11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3,4],ymm15[5,6,7,8],ymm9[9],ymm15[10],ymm9[11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2],ymm10[3,4],ymm14[5,6,7,8],ymm10[9],ymm14[10],ymm10[11,12],ymm14[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3,4],ymm7[5,6,7,8],ymm12[9],ymm7[10],ymm12[11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 88(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 120(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, 
%ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 88(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 120(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10,11],ymm15[12],ymm0[13],ymm15[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm6, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7,8],ymm12[9],ymm1[10],ymm12[11],ymm1[12,13],ymm12[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm13[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 544(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 384(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 608(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm11, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm10, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 544(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 384(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 608(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5695,7 +5674,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 512(%r9) @@ -5719,673 +5698,675 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf64: ; 
AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: subq $616, %rsp # imm = 0x268 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm13 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdx), %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,0,3,0,7,4,7,4] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm21 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm21[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3],xmm9[4],xmm4[5],xmm9[6],xmm4[7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; 
AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm1[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm5, %ymm9, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm27 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm31 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4],ymm5[5,6,7,8],ymm4[9],ymm5[10],ymm4[11,12],ymm5[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,2,3,2] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm3, %ymm5, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm24 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3,4],ymm4[5,6,7,8],ymm3[9],ymm4[10],ymm3[11,12],ymm4[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm28 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3,4],ymm4[5,6,7,8],ymm0[9],ymm4[10],ymm0[11,12],ymm4[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2],xmm11[3],xmm4[4,5],xmm11[6],xmm4[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm1[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm9[1],ymm2[2],ymm9[3],ymm2[4,5],ymm9[6],ymm2[7,8],ymm9[9],ymm2[10],ymm9[11],ymm2[12,13],ymm9[14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm18[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5],ymm8[6],ymm11[7,8],ymm8[9],ymm11[10,11],ymm8[12],ymm11[13],ymm8[14],ymm11[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm18 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 -; 
AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,1,1,1] -; AVX512F-SLOW-NEXT: vpandn %ymm10, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4,5],xmm8[6],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3],ymm5[4],ymm8[5,6],ymm5[7],ymm8[8,9],ymm5[10],ymm8[11],ymm5[12],ymm8[13,14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm4[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5],xmm14[6],xmm0[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] +; AVX512F-SLOW-NEXT: vpandn %ymm10, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, 
%ymm30 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm7, %ymm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] ; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2],ymm8[3,4],ymm7[5,6,7,8],ymm8[9],ymm7[10],ymm8[11,12],ymm7[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm7, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm4, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13],ymm4[14],ymm0[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] +; 
AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm1, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1],ymm0[2],ymm6[3],ymm0[4],ymm6[5,6],ymm0[7],ymm6[8,9],ymm0[10],ymm6[11],ymm0[12],ymm6[13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm14, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm16[1,1,2,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm21[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8,9],ymm3[10],ymm1[11],ymm3[12],ymm1[13,14],ymm3[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; 
AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 ; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm1, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandnq %ymm0, %ymm21, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm27 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm28[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm17[2,3,2,2] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm11, %ymm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3],ymm11[4],ymm14[5,6],ymm11[7],ymm14[8,9],ymm11[10],ymm14[11],ymm11[12],ymm14[13,14],ymm11[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm31[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10,11],ymm9[12],ymm5[13],ymm9[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm9, %zmm18, %zmm17 -; AVX512F-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm9 -; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm20 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm9 -; 
AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm20, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm11, %zmm17 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm18, %zmm13 -; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm15, %ymm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm20[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm19[2,3,2,2] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm15, %ymm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm13, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm24[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm1[1],ymm15[2],ymm1[3],ymm15[4,5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10],ymm1[11],ymm15[12,13],ymm1[14],ymm15[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm24[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10,11],ymm9[12],ymm1[13],ymm9[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm24, %zmm17 +; AVX512F-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm22 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; 
AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm2, %zmm17 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm24, %zmm5 +; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm3 ; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm17 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm20, %zmm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm22[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm22, %zmm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm27[0,1,0,1] ; AVX512F-SLOW-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,0,0] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm5 -; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpandn %ymm12, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm12, %ymm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm25[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm24[0,1,0,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 +; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm15 +; AVX512F-SLOW-NEXT: vpandnq %ymm15, %ymm21, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm30[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm28[0,1,0,0] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm19[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm15[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm26[2,2,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm29 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm11, %zmm19, %zmm26 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm11, %zmm29, %zmm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm13 = mem[0,1,0,1,4,5,4,5] -; 
AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm11, %zmm17, %zmm13 -; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm13 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm17, %zmm19 -; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm12, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm12, %zmm17, %zmm13 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm26[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm23[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm10[2,2,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm10 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm29 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm15, %zmm10, %zmm23 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm15, %zmm29, %zmm25 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm19, %zmm15 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm17 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm15, %zmm19, %zmm17 +; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, 
%zmm17, %zmm15, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm17 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm19, %zmm21 +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm17 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm15, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm27, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm14, %zmm19, %zmm8 ; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm15, %zmm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm17, %zmm3 +; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm14, %zmm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm19, %zmm2 ; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm15, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm2, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm14, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm24, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm24, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm11, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm13, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 384(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 576(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 
%zmm14, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 320(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 448(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 512(%r9) -; AVX512F-SLOW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 192(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 320(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 448(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 512(%r9) +; AVX512F-SLOW-NEXT: addq $616, %rsp # imm = 0x268 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-FAST-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm9 ; AVX512F-FAST-NEXT: vmovdqa64 96(%rdx), %ymm20 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm15 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,2,2,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm13 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,2,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm10 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm23[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm12[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] ; 
AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512F-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm4 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandnq %ymm2, %ymm22, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; 
AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm24[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm27 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm4 
-; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm30 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm30[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm30[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpandn %ymm3, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpandnq %ymm4, %ymm22, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm8, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm10, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm12 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,2,3,3,7,6,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,2,3,3,7,6,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm29 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7,8],ymm8[9],ymm0[10],ymm8[11],ymm0[12,13],ymm8[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm7, %ymm8 +; AVX512F-FAST-NEXT: vprolq $16, %ymm2, %ymm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3],ymm0[4],ymm8[5,6],ymm0[7],ymm8[8,9],ymm0[10],ymm8[11],ymm0[12],ymm8[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10],ymm7[11],ymm0[12,13],ymm7[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = 
ymm27[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm27 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3,4],ymm2[5,6,7,8],ymm6[9],ymm2[10],ymm6[11,12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8],ymm10[9],ymm4[10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm30 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm7[1],ymm2[2],ymm7[3,4],ymm2[5,6,7,8],ymm7[9],ymm2[10],ymm7[11,12],ymm2[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-FAST-NEXT: 
vpshufd {{.*#+}} ymm5 = ymm9[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm23[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5],ymm1[6],ymm11[7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13],ymm1[14],ymm11[15] -; AVX512F-FAST-NEXT: vprolq $16, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm11[2],ymm5[3],ymm11[4],ymm5[5,6],ymm11[7],ymm5[8,9],ymm11[10],ymm5[11],ymm11[12],ymm5[13,14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm9[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15] +; AVX512F-FAST-NEXT: vprolq $16, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3],ymm9[4],ymm0[5,6],ymm9[7],ymm0[8,9],ymm9[10],ymm0[11],ymm9[12],ymm0[13,14],ymm9[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8],ymm0[9],ymm9[10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10,11],ymm9[12],ymm1[13],ymm9[14],ymm1[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; 
AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandnq %ymm0, %ymm31, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandnq %ymm0, %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm11 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm30 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm15[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5],ymm0[6],ymm11[7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13],ymm0[14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm17[0,1,0,0] -; AVX512F-FAST-NEXT: vprolq $16, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3],ymm15[4],ymm9[5,6],ymm15[7],ymm9[8,9],ymm15[10],ymm9[11],ymm15[12],ymm9[13,14],ymm15[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13],ymm9[14],ymm15[15] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3],ymm15[4],ymm3[5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10,11],ymm15[12],ymm3[13],ymm15[14],ymm3[15] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm29[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5],ymm0[6],ymm12[7,8],ymm0[9],ymm12[10,11],ymm0[12],ymm12[13],ymm0[14],ymm12[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm16[0,1,0,0] +; AVX512F-FAST-NEXT: vprolq $16, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm10[2],ymm3[3],ymm10[4],ymm3[5,6],ymm10[7],ymm3[8,9],ymm10[10],ymm3[11],ymm10[12],ymm3[13,14],ymm10[15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7,8],ymm3[9],ymm10[10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm11[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5],ymm11[6],ymm4[7,8],ymm11[9],ymm4[10,11],ymm11[12],ymm4[13],ymm11[14],ymm4[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm4[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpbroadcastq 80(%r8), %ymm20 -; AVX512F-FAST-NEXT: vpandnq %ymm20, %ymm31, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm16 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm9, %zmm16, %zmm20 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm9, %zmm18, %zmm29 -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm9 = mem[2,3,2,3] +; AVX512F-FAST-NEXT: vpandnq %ymm20, %ymm30, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm26 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm28 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm1, %zmm26, %zmm29 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm1, %zmm28, %zmm27 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm1 = mem[2,3,2,3] ; AVX512F-FAST-NEXT: vpermq $174, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm16 = mem[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm17 = mem[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq $186, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm18 = mem[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm20 = mem[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} 
ymm23 = ymm23[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm25[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm8[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm31[2,3,2,2] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm25[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm24[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm22[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm21[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm19[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm19, %zmm19 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm19, %zmm22, %zmm21 -; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm19 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm21, %zmm19, %zmm15 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm21, %zmm11 -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm22, %zmm21 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm21, %zmm19, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm22, %zmm11 -; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm22[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm19[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm18[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm18 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = 
[65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm17, %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm17 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm17, %zmm11 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm18, %zmm12 +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm18 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm12, %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm17, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm12 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm19, %zmm12 +; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm1 ; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm16 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm11, %zmm16, %zmm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm26, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm28, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm11, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm31, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm22, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm16, %zmm2 -; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm8 -; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm12, %zmm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm11, %zmm6 -; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm16, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm26, %zmm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm28, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm12, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm21, %zmm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm14 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm19, %zmm14 +; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm6 +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm17 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm14, %zmm16, %zmm6 +; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm14 +; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm16 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm14, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm16, %zmm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm8, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm12, %zmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm12, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm23, %zmm11, %zmm27 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm11, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm16, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm12, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm12, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm27, %zmm30 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 384(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 448(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 576(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 192(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 320(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 512(%r9) -; AVX512F-FAST-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm9, %zmm13 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm3, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 448(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 576(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 192(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 320(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 512(%r9) +; AVX512F-FAST-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 
c20981d0d9398..66fdffccdfddc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -875,37 +875,36 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 16(%rcx), %xmm2 ; SSE-NEXT: movdqa 16(%r8), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; SSE-NEXT: movdqa %xmm11, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm6[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm0[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm8, %xmm0 +; SSE-NEXT: andnps %xmm9, %xmm0 ; SSE-NEXT: orps %xmm5, %xmm0 ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE-NEXT: movdqa %xmm15, %xmm13 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[3,3] ; SSE-NEXT: movdqa (%r8), %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm8[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm9[2,3] ; SSE-NEXT: movdqa (%r9), %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: andnps %xmm1, %xmm0 @@ -917,11 +916,11 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm12[3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: andnps %xmm4, %xmm0 @@ -938,41 +937,41 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: andnps %xmm3, %xmm7 ; SSE-NEXT: orps %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm11[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[0,2] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm11[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] +; SSE-NEXT: pslld $16, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm8, %xmm13 ; SSE-NEXT: andps %xmm1, %xmm3 ; SSE-NEXT: por %xmm3, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] ; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[0,2] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 ; SSE-NEXT: andps %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: por %xmm4, %xmm9 ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[1,1,1,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm14[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: andps %xmm3, %xmm4 @@ -1001,33 +1000,33 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[1,1],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] ; SSE-NEXT: andps %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] ; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm1, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: andps %xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[0,2] -; SSE-NEXT: andps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm10[0,2] +; SSE-NEXT: andps %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslld $16, %xmm2 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm1, 48(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) @@ -1035,7 +1034,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm11, 160(%rax) ; SSE-NEXT: movdqa %xmm15, (%rax) ; SSE-NEXT: movdqa %xmm8, 16(%rax) -; SSE-NEXT: movdqa %xmm5, 64(%rax) +; SSE-NEXT: movdqa %xmm9, 64(%rax) ; SSE-NEXT: movdqa %xmm13, 144(%rax) ; SSE-NEXT: movaps %xmm7, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1049,93 +1048,93 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm12 -; AVX1-ONLY-NEXT: vpslld $16, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm11 +; AVX1-ONLY-NEXT: vpslld $16, %xmm11, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm13 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm14 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1],xmm10[2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1],xmm12[2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0],xmm2[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0,1,2],xmm15[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm3[0,1],xmm11[0],xmm3[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm2[0,1],xmm10[0],xmm2[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2],xmm9[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 @@ -1144,28 +1143,28 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm13[0],xmm3[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm14[0],xmm3[3] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1176,7 +1175,7 @@ define void 
@store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm15, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 80(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 80(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm7, 176(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) @@ -1189,50 +1188,50 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm9[1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm11 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm12 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] @@ -1240,9 +1239,9 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] @@ -1254,25 +1253,25 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 
= ymm15[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm15, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] @@ -1280,15 +1279,15 @@ define void 
@store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 @@ -1311,30 +1310,30 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} 
xmm11 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-NEXT: vpbroadcastq %xmm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm11 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm12, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm14 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm12 @@ -1353,14 +1352,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2] ; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] @@ 
-1368,26 +1367,26 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,0,7,6,5,0,7,6] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,0,7,6,5,0,7,6] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] @@ -1421,120 +1420,128 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: subq $24, %rsp ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), 
%ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm11 +; AVX2-FAST-PERLANE-NEXT: 
vpmovzxwd {{.*#+}} xmm15 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: 
vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: 
vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $24, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1751,67 +1758,65 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: subq $312, %rsp # imm = 0x138 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa (%rsi), %xmm2 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm5 ; SSE-NEXT: movdqa 16(%rdx), %xmm14 ; SSE-NEXT: movdqa (%rcx), %xmm4 ; SSE-NEXT: movdqa 16(%rcx), %xmm10 -; SSE-NEXT: movdqa (%r8), %xmm7 +; SSE-NEXT: movdqa (%r8), %xmm8 ; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm3[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm7[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: andps %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm7 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: andnps %xmm11, %xmm0 -; SSE-NEXT: orps %xmm8, %xmm0 +; SSE-NEXT: orps %xmm9, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm12[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: andnps %xmm4, %xmm0 -; SSE-NEXT: orps %xmm2, %xmm0 +; SSE-NEXT: orps %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] 
-; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm8[0,1] -; SSE-NEXT: movdqa 16(%r9), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa 16(%r8), %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm9[0,1] +; SSE-NEXT: movdqa 16(%r9), %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: andnps %xmm12, %xmm0 @@ -1820,292 +1825,293 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: orps %xmm11, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm14[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm10[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm10[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: andnps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm9 -; SSE-NEXT: orps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm1 -; SSE-NEXT: movdqa 32(%rcx), %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa 32(%rdx), %xmm2 +; SSE-NEXT: movdqa 32(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm12 -; SSE-NEXT: movdqa 32(%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm0[3,3] -; SSE-NEXT: movdqa 32(%r8), %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[0,1] -; SSE-NEXT: movdqa 32(%r9), %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa 
32(%rsi), %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 32(%r8), %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm13[0,1] +; SSE-NEXT: movdqa 32(%r9), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: andnps %xmm0, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm15 -; SSE-NEXT: orps %xmm15, %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm9[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: andnps %xmm9, %xmm10 +; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: andnps %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm12 +; SSE-NEXT: orps %xmm12, %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: movaps %xmm6, %xmm11 +; SSE-NEXT: andnps %xmm1, %xmm11 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: orps %xmm0, %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm3 ; SSE-NEXT: movdqa 48(%rcx), %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[3,3] -; SSE-NEXT: movdqa 48(%r8), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = 
xmm10[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[0,1] -; SSE-NEXT: movdqa 48(%r9), %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm14, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm15 -; SSE-NEXT: orps %xmm15, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[3,3] +; SSE-NEXT: movdqa 48(%r8), %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm13[0,1] +; SSE-NEXT: movdqa 48(%r9), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: andnps %xmm13, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm11 +; SSE-NEXT: orps %xmm11, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: andps %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; SSE-NEXT: andnps %xmm9, %xmm6 -; SSE-NEXT: orps %xmm2, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: andnps %xmm1, %xmm6 +; SSE-NEXT: orps %xmm0, %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: andps %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm7[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[0,2] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[0,2] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: andps %xmm5, %xmm14 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: andps %xmm5, %xmm11 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm14[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm0[1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: andps %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm15[1] -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: andps 
%xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: andps %xmm5, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: andps %xmm5, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[0,2] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: andps %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: andps %xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 ; SSE-NEXT: andps %xmm5, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: andps %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] -; SSE-NEXT: 
movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm14, %xmm7 -; SSE-NEXT: andps %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm14, %xmm3 -; SSE-NEXT: andps %xmm9, %xmm8 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: andps %xmm1, %xmm8 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: pandn %xmm11, %xmm8 -; SSE-NEXT: andps %xmm5, %xmm14 -; SSE-NEXT: por %xmm14, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm12[0] -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm12[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] +; SSE-NEXT: andps %xmm5, %xmm15 +; SSE-NEXT: por %xmm15, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm10[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: andps %xmm1, %xmm9 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm10[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm13, %xmm9 +; 
SSE-NEXT: andps %xmm5, %xmm11 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: andps %xmm9, %xmm14 -; SSE-NEXT: por %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[0,2] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm15, %xmm12 -; SSE-NEXT: andps %xmm5, %xmm13 -; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm2[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm13, %xmm10 +; SSE-NEXT: andps %xmm5, %xmm14 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movdqa %xmm12, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm13[0,2] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm15, %xmm13 -; SSE-NEXT: andps %xmm9, %xmm14 -; SSE-NEXT: por %xmm14, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: andps %xmm5, %xmm15 -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm14, %xmm13 +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: por %xmm11, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1] +; SSE-NEXT: 
shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: andps %xmm5, %xmm14 +; SSE-NEXT: por %xmm14, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[0,2] -; SSE-NEXT: andps %xmm9, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,3,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm14[0,2] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[0,2] -; SSE-NEXT: andps %xmm5, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] +; SSE-NEXT: andps %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm14, %xmm5 +; SSE-NEXT: por %xmm12, %xmm5 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm5, 352(%rax) -; SSE-NEXT: movdqa %xmm9, 336(%rax) -; SSE-NEXT: movdqa %xmm14, 304(%rax) +; SSE-NEXT: movdqa %xmm1, 336(%rax) +; SSE-NEXT: movdqa %xmm11, 304(%rax) ; SSE-NEXT: movdqa %xmm13, 288(%rax) -; SSE-NEXT: movdqa %xmm12, 256(%rax) -; SSE-NEXT: movdqa %xmm11, 240(%rax) -; SSE-NEXT: movdqa %xmm8, 208(%rax) -; SSE-NEXT: movdqa %xmm3, 192(%rax) -; SSE-NEXT: movdqa %xmm7, 160(%rax) -; SSE-NEXT: movdqa %xmm6, 144(%rax) +; SSE-NEXT: movdqa %xmm10, 256(%rax) +; SSE-NEXT: movdqa %xmm15, 240(%rax) +; SSE-NEXT: movdqa %xmm9, 208(%rax) +; SSE-NEXT: movdqa %xmm6, 192(%rax) +; SSE-NEXT: movdqa %xmm8, 160(%rax) +; SSE-NEXT: movdqa %xmm7, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2134,7 +2140,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $280, %rsp # imm = 0x118 +; SSE-NEXT: addq $312, %rsp # imm = 0x138 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf32: @@ -2146,133 +2152,133 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm12[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm2 +; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] ; 
AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm13[2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm13[2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm12[0],xmm5[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4],xmm0[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm7 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0],xmm5[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: 
vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 @@ -2317,28 +2323,28 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1],xmm13[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] @@ -2358,39 +2364,39 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0],xmm6[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0],xmm7[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm9[5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%rax) @@ -2433,24 +2439,22 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: subq $616, %rsp # imm = 0x268 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] @@ -2460,274 +2464,267 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; 
AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; 
AVX2-SLOW-NEXT: # xmm4 = mem[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[12],ymm13[12],ymm2[13],ymm13[13],ymm2[14],ymm13[14],ymm2[15],ymm13[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = 
xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm15[4],ymm4[4],ymm15[5],ymm4[5],ymm15[6],ymm4[6],ymm15[7],ymm4[7],ymm15[12],ymm4[12],ymm15[13],ymm4[13],ymm15[14],ymm4[14],ymm15[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-SLOW-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm12, %ymm15 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[2],mem[2],ymm5[3],mem[3],ymm5[8],mem[8],ymm5[9],mem[9],ymm5[10],mem[10],ymm5[11],mem[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; 
AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 352(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-SLOW-NEXT: addq $616, %rsp # imm = 0x268 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $616, %rsp # imm = 0x268 +; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 @@ -2751,22 +2748,23 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm12 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -2775,19 +2773,20 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm9 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2797,11 +2796,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -2811,202 +2811,202 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = 
ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[2],ymm1[2],ymm9[3],ymm1[3],ymm9[8],ymm1[8],ymm9[9],ymm1[9],ymm9[10],ymm1[10],ymm9[11],ymm1[11] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm0[1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,2,1,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm13 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,2,1,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = 
ymm6[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm13 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm4[4],ymm10[5],ymm4[5],ymm10[6],ymm4[6],ymm10[7],ymm4[7],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, 
%ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: 
vpbroadcastq %xmm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 288(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $616, %rsp # imm = 0x268 +; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 @@ -3036,16 +3036,16 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3055,45 +3055,45 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, 
%ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3102,169 +3102,165 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: 
vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm11 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = 
xmm11[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} 
xmm14 = xmm14[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm8[2],ymm14[3,4],ymm8[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[12],ymm10[12],ymm12[13],ymm10[13],ymm12[14],ymm10[14],ymm12[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm14[4],ymm8[4],ymm14[5],ymm8[5],ymm14[6],ymm8[6],ymm14[7],ymm8[7],ymm14[12],ymm8[12],ymm14[13],ymm8[13],ymm14[14],ymm8[14],ymm14[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = 
ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, 
%ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm13, %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb 
%ymm0, %ymm9, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm8[0],ymm14[1],ymm8[1],ymm14[2],ymm8[2],ymm14[3],ymm8[3],ymm14[8],ymm8[8],ymm14[9],ymm8[9],ymm14[10],ymm8[10],ymm14[11],ymm8[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[8],ymm14[8],ymm2[9],ymm14[9],ymm2[10],ymm14[10],ymm2[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3273,13 +3269,13 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3288,7 +3284,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm23 @@ -3303,12 +3299,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm12[0],ymm5[1],ymm12[1],ymm5[2],ymm12[2],ymm5[3],ymm12[3],ymm5[8],ymm12[8],ymm5[9],ymm12[9],ymm5[10],ymm12[10],ymm5[11],ymm12[11] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 @@ -3316,58 +3312,58 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm27 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm30 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 ; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm0[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm16[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm10[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm10[0,1,2,3],zmm9[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = 
ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] @@ -3380,7 +3376,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm13[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm13[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] @@ -3388,99 +3384,99 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm10[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm2[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm2[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm12[4],ymm5[5],ymm12[5],ymm5[6],ymm12[6],ymm5[7],ymm12[7],ymm5[12],ymm12[12],ymm5[13],ymm12[13],ymm5[14],ymm12[14],ymm5[15],ymm12[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm31[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm4[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm9[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm14[4],ymm10[4],ymm14[5],ymm10[5],ymm14[6],ymm10[6],ymm14[7],ymm10[7],ymm14[12],ymm10[12],ymm14[13],ymm10[13],ymm14[14],ymm10[14],ymm14[15],ymm10[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm9[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: 
vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = 
xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm29[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[2,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,0,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -3490,55 +3486,55 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm24[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm23[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm23[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm22, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -3546,16 +3542,16 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: pushq %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm20, %zmm23 @@ -3565,212 +3561,212 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [8,9,20,11,12,21,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = 
[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,9,2,3,8,5,6,11] ; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm16, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm29 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm0[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm0[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm0[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[8],ymm11[8],ymm0[9],ymm11[9],ymm0[10],ymm11[10],ymm0[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm2, %zmm20, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm25, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm14, %ymm16, %ymm25 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm3[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm20, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm25, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm10, %ymm16, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm4[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm1[4],ymm15[5],ymm1[5],ymm15[6],ymm1[6],ymm15[7],ymm1[7],ymm15[12],ymm1[12],ymm15[13],ymm1[13],ymm15[14],ymm1[14],ymm15[15],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm24, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[8],ymm3[8],ymm13[9],ymm3[9],ymm13[10],ymm3[10],ymm13[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm11[4],ymm0[5],ymm11[5],ymm0[6],ymm11[6],ymm0[7],ymm11[7],ymm0[12],ymm11[12],ymm0[13],ymm11[13],ymm0[14],ymm11[14],ymm0[15],ymm11[15] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm2[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, 
%ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm16, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm31, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm24, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm15, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm26, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[12],ymm0[12],ymm13[13],ymm0[13],ymm13[14],ymm0[14],ymm13[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm13[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm12[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm10[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm9[2,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm5, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm5, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm31, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm1[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: 
vpermt2d %ymm1, %ymm26, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm1[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,0,2,2,1,0,2,2] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm6[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,0,2,1,4,5,6,7] -; 
AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm8, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm7, %zmm10, %zmm2 -; 
AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm14[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm23[0,1,2,3],zmm21[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm3, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm18, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm9, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm19, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm25[0,1,2,3],zmm22[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm19[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm26, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm15, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm16[0,1,2,3] -; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm9[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm7, %ymm3, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm10[0,1,2,3],zmm2[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm19, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm16[0,1,2,3],zmm17[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm28, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm15[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm12[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm6, %ymm5, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm3[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: popq %rax ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -3780,7 +3776,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 @@ -3799,16 +3795,16 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm12[0],ymm5[1],ymm12[1],ymm5[2],ymm12[2],ymm5[3],ymm12[3],ymm5[8],ymm12[8],ymm5[9],ymm12[9],ymm5[10],ymm12[10],ymm5[11],ymm12[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm15 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 @@ -3818,51 +3814,51 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm30 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,2,3,3] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm0[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm16, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm16, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm9 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm9 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm10 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm7[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3],ymm13[4],ymm6[5,6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm6[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; 
AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero @@ -3870,161 +3866,161 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm13[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm4 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm9, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm9[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm9[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm2[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm12[4],ymm5[5],ymm12[5],ymm5[6],ymm12[6],ymm5[7],ymm12[7],ymm5[12],ymm12[12],ymm5[13],ymm12[13],ymm5[14],ymm12[14],ymm5[15],ymm12[15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm14[4],ymm9[4],ymm14[5],ymm9[5],ymm14[6],ymm9[6],ymm14[7],ymm9[7],ymm14[12],ymm9[12],ymm14[13],ymm9[13],ymm14[14],ymm9[14],ymm14[15],ymm9[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm4, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = 
ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm5[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm14[4],ymm9[4],ymm14[5],ymm9[5],ymm14[6],ymm9[6],ymm14[7],ymm9[7],ymm14[12],ymm9[12],ymm14[13],ymm9[13],ymm14[14],ymm9[14],ymm14[15],ymm9[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm5, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm1[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 ; AVX512DQ-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm3, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm3, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm28[0,1,2,1] -; AVX512DQ-SLOW-NEXT: 
vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm27[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm28[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm27[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm5, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm6, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = 
zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm22, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm11 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm9, %zmm5 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm5, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -4044,209 +4040,212 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm3, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm2, %zmm22 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm21, %zmm20 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm21, %zmm20 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = 
ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm9, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm4[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm9, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm3 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm15[4],ymm5[4],ymm15[5],ymm5[5],ymm15[6],ymm5[6],ymm15[7],ymm5[7],ymm15[12],ymm5[12],ymm15[13],ymm5[13],ymm15[14],ymm5[14],ymm15[15],ymm5[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm13 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm13[4],ymm4[5],ymm13[5],ymm4[6],ymm13[6],ymm4[7],ymm13[7],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[2],ymm2[2],ymm10[3],ymm2[3],ymm10[8],ymm2[8],ymm10[9],ymm2[9],ymm10[10],ymm2[10],ymm10[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[8],ymm5[8],ymm12[9],ymm5[9],ymm12[10],ymm5[10],ymm12[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm5, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm0 ; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm23, %zmm21 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm9, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm24 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm0[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm26, %ymm5 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: 
vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm25, %ymm7 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm11 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm25, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm5, %zmm7 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm26, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm14, %ymm28 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero 
-; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm26, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm8, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm5[0],ymm15[1],ymm5[1],ymm15[2],ymm5[2],ymm15[3],ymm5[3],ymm15[8],ymm5[8],ymm15[9],ymm5[9],ymm15[10],ymm5[10],ymm15[11],ymm5[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm8, %zmm15 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm14 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm7, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm25, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm14, %ymm25 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} 
ymm9 = ymm9[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm28, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm6, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm6, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm14, %ymm24, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm14 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm13, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm13, %ymm27, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm13 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm13[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX512DQ-FAST-NEXT: vpermd %zmm7, %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = 
xmm10[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm11 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[2],mem[2],ymm13[3],mem[3],ymm13[8],mem[8],ymm13[9],mem[9],ymm13[10],mem[10],ymm13[11],mem[11] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm12, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm4 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm22[0,1,2,3],zmm20[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm13, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm14, %zmm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm23[0,1,2,3],zmm21[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm16 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm14, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm12, %ymm6, %ymm7 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = 
zmm7[0,1,2,3],zmm25[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm27[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm29, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm15[0,1,2,3],zmm10[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm2, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm2, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm27, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm22[0,1,2,3],zmm20[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm16, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm23[0,1,2,3],zmm21[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm16, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm24[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm25, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm26[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm12[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 
= zmm2[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm6[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512DQ-FAST-NEXT: addq $40, %rsp ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -4358,487 +4357,498 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $792, %rsp # imm = 0x318 +; SSE-NEXT: subq $808, %rsp # imm = 0x328 ; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm11 -; SSE-NEXT: movdqa 16(%rdx), %xmm8 +; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: movdqa 16(%rcx), %xmm4 +; SSE-NEXT: movdqa 16(%rcx), %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[3,3] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm5[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm5[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: andps %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; 
SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm8, %xmm0 +; SSE-NEXT: orps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: andnps %xmm7, %xmm5 -; SSE-NEXT: orps %xmm0, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm12[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm6, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm11[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[0,1] +; SSE-NEXT: movdqa 16(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[2,3],xmm2[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: andnps %xmm3, %xmm5 -; SSE-NEXT: orps %xmm0, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm2 +; SSE-NEXT: movdqa 32(%rcx), %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[3,3] -; SSE-NEXT: movdqa 16(%r8), %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[0,1] -; SSE-NEXT: movdqa 16(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm7 -; SSE-NEXT: andnps %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm8[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm0 -; SSE-NEXT: movdqa 32(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[3,3] -; SSE-NEXT: movdqa 32(%r8), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 32(%r8), %xmm0 +; 
SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm8[0,1] -; SSE-NEXT: movdqa 32(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm8, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm8, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm7 -; SSE-NEXT: orps %xmm7, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: orps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm6[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm7 -; SSE-NEXT: andnps %xmm6, %xmm7 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm6, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm2 ; SSE-NEXT: movdqa 48(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa 48(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm3[3,3] +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3] ; SSE-NEXT: movdqa 48(%r8), %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps 
{{.*#+}} xmm8 = xmm8[1,2],xmm11[0,1] -; SSE-NEXT: movdqa 48(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm11, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm11, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm8 -; SSE-NEXT: orps %xmm8, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: orps %xmm8, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm7[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm8 -; SSE-NEXT: andnps %xmm7, %xmm8 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdx), %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdx), %xmm2 ; SSE-NEXT: movdqa 64(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa 64(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm11 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3] ; SSE-NEXT: movdqa 64(%r8), %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,2],xmm12[0,1] -; SSE-NEXT: movdqa 64(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movdqa 64(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm12, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm12, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm11 -; SSE-NEXT: orps %xmm11, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm8[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm8, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm8, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdx), %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdx), %xmm2 ; SSE-NEXT: movdqa 80(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa 80(%rsi), %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[3,3] -; SSE-NEXT: movdqa 80(%r8), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3] +; SSE-NEXT: movdqa 80(%r8), %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm15[0,1] -; SSE-NEXT: movdqa 80(%r9), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movdqa 80(%r9), %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm15, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm12 -; SSE-NEXT: orps %xmm12, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: orps %xmm12, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,6,5,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm11[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm11, %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: andnps %xmm11, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdx), %xmm2 ; SSE-NEXT: movdqa 96(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm3 ; SSE-NEXT: movdqa 96(%rsi), %xmm12 -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm0[3,3] ; SSE-NEXT: movdqa 96(%r8), %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm11[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm13[0,1] -; SSE-NEXT: movdqa 96(%r9), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm13, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] +; 
SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm0[0,1] +; SSE-NEXT: movdqa 96(%r9), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: movaps %xmm14, %xmm13 +; SSE-NEXT: andnps %xmm0, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm15 -; SSE-NEXT: orps %xmm15, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: orps %xmm15, %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm12[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: andnps %xmm12, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdx), %xmm0 -; SSE-NEXT: movdqa 112(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa 112(%rsi), %xmm13 -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: andnps %xmm1, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdx), %xmm4 +; SSE-NEXT: movdqa 112(%rcx), %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm8[3,3] -; SSE-NEXT: movdqa 112(%r8), %xmm12 -; 
SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm10[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[3,3] +; SSE-NEXT: movdqa 112(%r8), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm13[0,1] ; SSE-NEXT: movdqa 112(%r9), %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] ; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: andnps %xmm10, %xmm12 +; SSE-NEXT: andnps %xmm13, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] ; SSE-NEXT: andps %xmm14, %xmm15 ; SSE-NEXT: orps %xmm15, %xmm12 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm4[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; SSE-NEXT: andnps %xmm10, %xmm14 -; SSE-NEXT: orps %xmm1, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: andnps %xmm1, %xmm14 +; SSE-NEXT: orps %xmm0, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm13 ; 
SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: andps %xmm8, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm13[0,2] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: andps %xmm8, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[0,2] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = 
xmm9[2,0],xmm2[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm9 +; SSE-NEXT: andps %xmm10, %xmm9 ; SSE-NEXT: por %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm9[0,2] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm0 
+; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[1,3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; 
SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] @@ -4847,9 +4857,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: 
por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] @@ -4857,11 +4867,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -4871,18 +4881,18 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm6 +; SSE-NEXT: andps %xmm10, %xmm6 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -4896,9 +4906,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] @@ -4906,9 +4916,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: andps %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -4920,71 +4930,69 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: andps %xmm8, %xmm0 +; 
SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm12, %xmm7 +; SSE-NEXT: andps %xmm10, %xmm7 ; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] -; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 ; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[2,1],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm10, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4996,9 +5004,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: movdqa %xmm12, %xmm13 ; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,1,1,4,5,6,7] @@ -5006,10 +5014,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -5021,72 +5029,72 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] ; SSE-NEXT: psrldq {{.*#+}} xmm11 = 
xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm3[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: andps %xmm12, %xmm11 +; SSE-NEXT: andps %xmm10, %xmm11 ; SSE-NEXT: por %xmm11, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[1,3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3] +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] ; SSE-NEXT: movdqa %xmm15, %xmm5 ; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm11 ; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movaps %xmm1, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: andps %xmm12, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: andps %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,2] -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,2,3,3] -; SSE-NEXT: pslld $16, %xmm9 -; SSE-NEXT: pandn %xmm9, %xmm8 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm8 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[0,2] -; SSE-NEXT: andps %xmm12, %xmm7 -; SSE-NEXT: pandn 
%xmm5, %xmm12 -; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[0,2] +; SSE-NEXT: andps %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm7, %xmm10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm12, 736(%rax) -; SSE-NEXT: movdqa %xmm8, 720(%rax) +; SSE-NEXT: movdqa %xmm10, 736(%rax) +; SSE-NEXT: movdqa %xmm12, 720(%rax) ; SSE-NEXT: movdqa %xmm15, 688(%rax) ; SSE-NEXT: movdqa %xmm11, 672(%rax) ; SSE-NEXT: movdqa %xmm4, 640(%rax) ; SSE-NEXT: movdqa %xmm6, 624(%rax) -; SSE-NEXT: movdqa %xmm10, 592(%rax) +; SSE-NEXT: movdqa %xmm9, 592(%rax) ; SSE-NEXT: movdqa %xmm13, 576(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%rax) @@ -5108,7 +5116,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%rax) @@ -5120,7 +5128,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) @@ -5167,7 +5175,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $792, %rsp # imm = 0x318 +; SSE-NEXT: addq $808, %rsp # imm = 0x328 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf64: @@ -5606,13 +5614,13 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -5647,22 +5655,22 @@ define void 
@store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm6[0,1],xmm1[0],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm7[0,1],xmm1[0],xmm7[3] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] @@ -5679,7 +5687,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 16(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 112(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm10, 96(%rax) @@ -5771,260 +5779,262 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i16_stride6_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-SLOW-NEXT: 
vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm14 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; 
AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; 
AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = 
ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = 
ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -6032,22 +6042,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero +; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -6056,10 +6066,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] @@ -6110,7 +6121,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[8],ymm7[8],ymm15[9],ymm7[9],ymm15[10],ymm7[10],ymm15[11],ymm7[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -6127,206 +6138,209 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; 
AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[2],mem[2],ymm13[3],mem[3],ymm13[8],mem[8],ymm13[9],mem[9],ymm13[10],mem[10],ymm13[11],mem[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; 
AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[2,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[2,3,2,3] -; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[2,3,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = 
ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[4],mem[4],ymm13[5],mem[5],ymm13[6],mem[6],ymm13[7],mem[7],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm15, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm10[4],mem[4],ymm10[5],mem[5],ymm10[6],mem[6],ymm10[7],mem[7],ymm10[12],mem[12],ymm10[13],mem[13],ymm10[14],mem[14],ymm10[15],mem[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} 
ymm10 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 736(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 672(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 736(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 
+; AVX2-SLOW-NEXT: vmovaps %ymm1, 672(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 544(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 640(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 640(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 512(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 448(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6355,23 +6369,24 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i16_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1592, %rsp # imm = 0x638 +; AVX2-FAST-NEXT: subq $1560, %rsp # imm = 0x618 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6381,223 +6396,222 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vmovdqa 64(%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm12 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; 
AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; 
AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm6 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpmovzxwd (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] @@ -6607,18 +6621,20 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6631,8 +6647,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -6640,9 +6656,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -6653,18 +6667,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = mem[0,0,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm12[0],ymm1[1],ymm12[1],ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[8],ymm12[8],ymm1[9],ymm12[9],ymm1[10],ymm12[10],ymm1[11],ymm12[11] +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,2,2,5,4,6,6] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm14[0],ymm3[1],ymm14[1],ymm3[2],ymm14[2],ymm3[3],ymm14[3],ymm3[8],ymm14[8],ymm3[9],ymm14[9],ymm3[10],ymm14[10],ymm3[11],ymm14[11] +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; 
AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -6692,173 +6707,174 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 
32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,2,1,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,2,1,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; 
AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm0 +; 
AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded 
Reload +; AVX2-FAST-NEXT: # ymm14 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, 736(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -6866,15 +6882,15 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %ymm0, 544(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 352(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 704(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 640(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 640(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6903,71 +6919,73 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $1592, %rsp # imm = 0x638 +; AVX2-FAST-NEXT: addq $1560, %rsp # imm = 0x618 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX2-FAST-PERLANE-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa (%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -6975,7 +6993,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -6983,18 +7001,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7007,18 +7026,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7031,7 +7051,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7040,10 +7060,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7056,7 +7076,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), 
%ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7064,11 +7084,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7081,19 +7101,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[8],ymm7[8],ymm0[9],ymm7[9],ymm0[10],ymm7[10],ymm0[11],ymm7[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] @@ -7106,355 +7126,354 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 
%xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm12 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq 
{{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, (%rsp), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, 
%ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; 
AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[8],ymm1[8],ymm7[9],ymm1[9],ymm7[10],ymm1[10],ymm7[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm7 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm8 = 
ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm12 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[8],ymm8[8],ymm12[9],ymm8[9],ymm12[10],ymm8[10],ymm12[11],ymm8[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm12 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm8 = 
ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[8],ymm8[8],ymm12[9],ymm8[9],ymm12[10],ymm8[10],ymm12[11],ymm8[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = 
ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # 
xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm6[4],mem[4],ymm6[5],mem[5],ymm6[6],mem[6],ymm6[7],mem[7],ymm6[12],mem[12],ymm6[13],mem[13],ymm6[14],mem[14],ymm6[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = 
ymm5[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm9[4],mem[4],ymm9[5],mem[5],ymm9[6],mem[6],ymm9[7],mem[7],ymm9[12],mem[12],ymm9[13],mem[13],ymm9[14],mem[14],ymm9[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[4],mem[4],ymm13[5],mem[5],ymm13[6],mem[6],ymm13[7],mem[7],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: 
vpshufb %ymm6, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm9[4],mem[4],ymm9[5],mem[5],ymm9[6],mem[6],ymm9[7],mem[7],ymm9[12],mem[12],ymm9[13],mem[13],ymm9[14],mem[14],ymm9[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm9[4],mem[4],ymm9[5],mem[5],ymm9[6],mem[6],ymm9[7],mem[7],ymm9[12],mem[12],ymm9[13],mem[13],ymm9[14],mem[14],ymm9[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm10[4],mem[4],ymm10[5],mem[5],ymm10[6],mem[6],ymm10[7],mem[7],ymm10[12],mem[12],ymm10[13],mem[13],ymm10[14],mem[14],ymm10[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 736(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 704(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 672(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 736(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 704(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 672(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 544(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 512(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 640(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 608(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 448(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7471,21 +7490,21 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-FAST-PERLANE-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $408, %rsp # imm = 0x198 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: subq $392, %rsp # imm = 0x188 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] @@ -7499,10 +7518,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 @@ -7518,961 +7537,958 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm0[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: 
vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,0,2,2,5,4,6,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm5[2],ymm11[3,4],ymm5[5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} 
ymm12 = ymm12[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm10, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[8],ymm8[8],ymm12[9],ymm8[9],ymm12[10],ymm8[10],ymm12[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm7[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm6[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm11, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm6[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm11[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = 
ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: 
vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = 
ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm10[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm6[1,2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm11[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw 
{{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm11[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm4[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm9, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm13, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm0, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm10[0,1,2,3],zmm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm11 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm14[1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm14, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; 
AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm14[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm11 
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,0,2,2,5,4,6,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm11[0,1,2,3],zmm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm13[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,1,2,3,6,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[8],ymm1[8],ymm14[9],ymm1[9],ymm14[10],ymm1[10],ymm14[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: 
vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm8[4],ymm2[5],ymm8[5],ymm2[6],ymm8[6],ymm2[7],ymm8[7],ymm2[12],ymm8[12],ymm2[13],ymm8[13],ymm2[14],ymm8[14],ymm2[15],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm11[4],ymm2[5],ymm11[5],ymm2[6],ymm11[6],ymm2[7],ymm11[7],ymm2[12],ymm11[12],ymm2[13],ymm11[13],ymm2[14],ymm11[14],ymm2[15],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm13[4],ymm10[5],ymm13[5],ymm10[6],ymm13[6],ymm10[7],ymm13[7],ymm10[12],ymm13[12],ymm10[13],ymm13[13],ymm10[14],ymm13[14],ymm10[15],ymm13[15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm8 ; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm25, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm29, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm12[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm12[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm25 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[1,0,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm14[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm12 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm14 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 
= ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm23 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm6[1,2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm23 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm23, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm25, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm15 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm18, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm18, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm18, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, 
%zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm27, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm24, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm14, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm14, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $408, %rsp # imm = 0x198 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 
640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1064, %rsp # imm = 0x428 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 ; AVX512F-ONLY-FAST-NEXT: 
vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[8],ymm14[8],ymm6[9],ymm14[9],ymm6[10],ymm14[10],ymm6[11],ymm14[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm22, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm19, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm18 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm16 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,8,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [0,9,2,3,8,5,6,11] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm8, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm27, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 
%xmm1, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm22, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm16 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm19, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm8, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm27, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm0 ; 
AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm11[4],ymm2[5],ymm11[5],ymm2[6],ymm11[6],ymm2[7],ymm11[7],ymm2[12],ymm11[12],ymm2[13],ymm11[13],ymm2[14],ymm11[14],ymm2[15],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm24, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm11 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm25 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [5,6,5,6,5,6,7,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm24 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm29, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm6, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm15[4],ymm11[4],ymm15[5],ymm11[5],ymm15[6],ymm11[6],ymm15[7],ymm11[7],ymm15[12],ymm11[12],ymm15[13],ymm11[13],ymm15[14],ymm11[14],ymm15[15],ymm11[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm25, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm20, %zmm29 
-; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm10, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [1,0,2,2,1,0,2,2] -; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm29, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm12, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [1,0,2,2,1,0,2,2] +; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm31, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm30 +; AVX512F-ONLY-FAST-NEXT: 
vinserti64x4 $1, %ymm8, %zmm5, %zmm30 ; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm19, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm31, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm23, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[8],ymm11[8],ymm15[9],ymm11[9],ymm15[10],ymm11[10],ymm15[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm22, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm27 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm8, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[8],ymm11[8],ymm15[9],ymm11[9],ymm15[10],ymm11[10],ymm15[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm26, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm31, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), 
%ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm27, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm0[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[2],ymm13[2],ymm8[3],ymm13[3],ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm1, %zmm21, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm8, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm15[4],ymm11[4],ymm15[5],ymm11[5],ymm15[6],ymm11[6],ymm15[7],ymm11[7],ymm15[12],ymm11[12],ymm15[13],ymm11[13],ymm15[14],ymm11[14],ymm15[15],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm20, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm27, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[12],ymm0[12],ymm7[13],ymm0[13],ymm7[14],ymm0[14],ymm7[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm8[4],ymm13[4],ymm8[5],ymm13[5],ymm8[6],ymm13[6],ymm8[7],ymm13[7],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm13[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm25, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm27 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm7[2,2,2,2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm6, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm7, %ymm18, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm8[4],ymm0[5],ymm8[5],ymm0[6],ymm8[6],ymm0[7],ymm8[7],ymm0[12],ymm8[12],ymm0[13],ymm8[13],ymm0[14],ymm8[14],ymm0[15],ymm8[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm15[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm25, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, 
%xmm7, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm13[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm8[4],ymm0[4],ymm8[5],ymm0[5],ymm8[6],ymm0[6],ymm8[7],ymm0[7],ymm8[12],ymm0[12],ymm8[13],ymm0[13],ymm8[14],ymm0[14],ymm8[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[2],ymm14[2],ymm4[3],ymm14[3],ymm4[8],ymm14[8],ymm4[9],ymm14[9],ymm4[10],ymm14[10],ymm4[11],ymm14[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; 
AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm1, %zmm4, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm18, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm19, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb 
%ymm10, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm19, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm18[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm16[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm27[0,1,2,3],zmm21[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm18, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm25[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm14[0,1,2,3],zmm29[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} 
ymm3 = ymm24[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm18 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm31, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm31, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm16[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm20[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm20 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm22[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm21[0,1,2,3],zmm17[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm24[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm28[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm27[0,1,2,3],zmm6[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm13[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm21 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpermq 
{{.*#+}} ymm29 = ymm9[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm11, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # xmm22 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm26 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm27 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, 
%zmm24, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm18, %ymm2, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm22, %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm26, %ymm2, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm27, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = zmm30[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm21, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm31[0,1,2,3],zmm28[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm23[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm27 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm6, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm28 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm29[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm31, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm21, %ymm4, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm22, %ymm4, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm27, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm28, %ymm4, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm30[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm14, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm23[0,1,2,3],mem[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm19[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,2,3],zmm26[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm14, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1064, %rsp # imm = 0x428 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $408, %rsp # imm = 0x198 +; AVX512DQ-SLOW-NEXT: subq $392, %rsp # imm = 0x188 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 @@ -8481,7 +8497,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] @@ -8495,10 +8511,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 @@ -8519,1053 +8535,1045 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %xmm19 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm0[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512DQ-SLOW-NEXT: vpshufd 
{{.*#+}} ymm5 = ymm0[1,0,2,2,5,4,6,6] ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm6, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm12, %xmm18 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm11, %zmm6, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm5 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm12, %xmm16 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm10, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm5[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,1,2,3,6,5,6,7] +; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm11[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm11, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm8, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm13[0],ymm10[0],ymm13[1],ymm10[1],ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[8],ymm10[8],ymm13[9],ymm10[9],ymm13[10],ymm10[10],ymm13[11],ymm10[11] 
+; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm10, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm15 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 
64(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[12],ymm12[12],ymm8[13],ymm12[13],ymm8[14],ymm12[14],ymm8[15],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[12],ymm7[12],ymm4[13],ymm7[13],ymm4[14],ymm7[14],ymm4[15],ymm7[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm11, %zmm10, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw 
{{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm10[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm11, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm31 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm6[1,2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm11[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm11 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512DQ-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm2, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm9[1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm13, %xmm17 -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; 
AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm14, %xmm17 +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm14 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm14, %zmm11 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm11 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm13, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7] ; AVX512DQ-SLOW-NEXT: 
vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm14, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm11[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[8],ymm7[8],ymm4[9],ymm7[9],ymm4[10],ymm7[10],ymm4[11],ymm7[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[8],ymm12[8],ymm8[9],ymm12[9],ymm8[10],ymm12[10],ymm8[11],ymm12[11] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm2 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,2,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm12 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm14, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm12, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm13 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} 
ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm12, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5,6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = 
ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm1, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm18[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; 
AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm8, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm3, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm9 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm11[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm27, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm25 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 
= xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm10, %zmm5 {%k2} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = 
zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm12 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512DQ-SLOW-NEXT: 
vinserti32x8 $1, %ymm12, %zmm13, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm8, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm23 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm23, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm24 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm24, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} 
ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm18 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm14, %zmm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm29, %zmm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm16 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm22, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm14, %zmm13 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm15 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm14, %zmm15 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm14, %zmm13 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm15 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm14, %zmm15 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm0 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: 
vpternlogq $184, %zmm4, %zmm14, %zmm2 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $408, %rsp # imm = 0x198 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $920, %rsp # imm = 0x398 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512DQ-FAST-NEXT: subq $888, %rsp # imm = 0x378 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm15, %xmm28 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm18 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm10 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm15[0],ymm4[0],ymm15[1],ymm4[1],ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[8],ymm4[8],ymm15[9],ymm4[9],ymm15[10],ymm4[10],ymm15[11],ymm4[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm23 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm10 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm17, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm18, %ymm12 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-FAST-NEXT: 
vinserti64x4 $1, %ymm10, %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm21 ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm6, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm5, %zmm21 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm20, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm6, %ymm19, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm6[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm20, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm16, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, 
%ymm10, %ymm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm18, %ymm8 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm27 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm20, %zmm29 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm16, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm17, %ymm9 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm27 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm29 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm6 +; 
AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm2[0,0,2,1] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm25 ; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm9, %zmm25 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm8, %zmm25 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm22, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm5 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm14 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: 
vinserti64x4 $1, %ymm6, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm8, %zmm11 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[12],ymm13[12],ymm2[13],ymm13[13],ymm2[14],ymm13[14],ymm2[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm17, %ymm5 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} 
xmm3 = xmm1[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm22, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm15[4],ymm4[4],ymm15[5],ymm4[5],ymm15[6],ymm4[6],ymm15[7],ymm4[7],ymm15[12],ymm4[12],ymm15[13],ymm4[13],ymm15[14],ymm4[14],ymm15[15],ymm4[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm18, %ymm6 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm0, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm10 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm13 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm16, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm0[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 ; 
AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm1[4],ymm8[5],ymm1[5],ymm8[6],ymm1[6],ymm8[7],ymm1[7],ymm8[12],ymm1[12],ymm8[13],ymm1[13],ymm8[14],ymm1[14],ymm8[15],ymm1[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm17, %ymm6 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[8],ymm7[8],ymm15[9],ymm7[9],ymm15[10],ymm7[10],ymm15[11],ymm7[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm7, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm5 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm5, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm19, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm5[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm5[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm14 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm7[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm18, %ymm7 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; 
AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm9, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm7 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm7, %ymm16, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm7[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm13, %xmm10 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm12[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm12 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm17 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm12, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm16, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm14, %zmm16 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm15, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm13 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm12 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm12 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm10 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm12, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm16 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm1[1,1,1,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm1[1,1,1,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; 
AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm1[1,1,1,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm21[0,1,2,3],zmm23[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm21 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm27[0,1,2,3],zmm29[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm27, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm31, %zmm24 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm13, %zmm1, %zmm24 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm0[0,1,2,3],zmm20[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm22, %zmm19 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm13, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm15 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm25 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm11 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm30[0],zero,xmm30[1],zero,xmm30[2],zero,xmm30[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm2, %ymm0 +; AVX512DQ-FAST-NEXT: 
vinserti64x4 $1, %ymm3, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm15, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm3, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,2,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm21[0,1,2,3],zmm24[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm14, %zmm21, %zmm24 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm27[0,1,2,3],zmm29[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm14, %zmm21, %zmm27 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm17[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm21, %zmm29 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm20[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm30 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm21, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; 
AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm17, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm17, %ymm11 ; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm9, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm7, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm12, %ymm20, %ymm13 -; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm25[0,1,2,3],zmm26[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm20, %ymm9 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm17, %ymm16 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm17, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm20 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm3, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm19 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: 
vinserti32x8 $1, %ymm10, %zmm8, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm15, %zmm13, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm18, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm15, %ymm20 +; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm10 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm25[0,1,2,3],zmm26[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm15, %ymm19 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm7 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm3 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm31 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm25, %zmm27 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm11[0,1,2,3],zmm28[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm31 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm28[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm5, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm14 ; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm7, 
%zmm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,0,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm11 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm11, %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm11, %ymm15, %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm3, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm3, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm20, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm3[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm3[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm6[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm4[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm4, %ymm26 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm6, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = 
[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm11[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm25, %zmm5 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm18[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm26, %zmm18 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm17, %zmm25, %zmm18 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm16[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm15[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm29, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm7[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm28, %zmm25, %zmm5 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm16[0,1,2,3],zmm23[0,1,2,3] +; 
AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm26, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm16, %zmm25, %zmm6 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm22[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm9 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm20[0,1,2,3],zmm21[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm19[0,1,2,3],zmm10[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm8[0,1,2,3],zmm17[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm18[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm2 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) -; AVX512DQ-FAST-NEXT: addq $920, %rsp # imm = 0x398 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 704(%rax) +; AVX512DQ-FAST-NEXT: addq $888, %rsp # imm = 0x378 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = 
[0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm25, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] -; AVX512BW-NEXT: # zmm18 = 
mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm22, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm17 -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm18 -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm24, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm25 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm14 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 
+; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 ; AVX512BW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm12 ; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm14 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] 
-; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm7, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm10, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2w 
%zmm11, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 982c07752b6b5..fe4c2504228a2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -172,77 +172,77 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm6, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE-NEXT: 
punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: por %xmm12, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 ; SSE-NEXT: psrld $16, %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,65535] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] ; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 ; SSE-NEXT: psrlq $48, %xmm4 ; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: andps %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: andnps %xmm3, %xmm2 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, (%rax) -; SSE-NEXT: movq %xmm5, 48(%rax) +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: andnps %xmm2, %xmm3 +; SSE-NEXT: orps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movq %xmm7, 48(%rax) ; SSE-NEXT: movdqa %xmm6, 32(%rax) -; SSE-NEXT: movdqa 
%xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm5, 16(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf4: @@ -583,53 +583,53 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i16_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm5 ; SSE-NEXT: movdqa (%rcx), %xmm11 ; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa (%r9), %xmm9 -; SSE-NEXT: movdqa (%rax), %xmm3 +; SSE-NEXT: movdqa (%r9), %xmm10 +; SSE-NEXT: movdqa (%rax), %xmm2 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,2],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE-NEXT: movdqa %xmm10, %xmm13 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 ; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm12[0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm12[0,3] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm13 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; SSE-NEXT: por %xmm9, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm9, %xmm14 +; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] ; SSE-NEXT: por %xmm15, %xmm14 ; SSE-NEXT: movdqa %xmm6, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm15[0,2] -; SSE-NEXT: andps %xmm10, %xmm14 -; SSE-NEXT: andnps %xmm13, %xmm10 -; SSE-NEXT: orps 
%xmm14, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,1,0,1] +; SSE-NEXT: andps %xmm9, %xmm14 +; SSE-NEXT: andnps %xmm13, %xmm9 +; SSE-NEXT: orps %xmm14, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm14, %xmm12 @@ -641,7 +641,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm12, %xmm15 @@ -653,52 +653,51 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm15, %xmm12 ; SSE-NEXT: pandn %xmm14, %xmm15 ; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] ; SSE-NEXT: por %xmm12, %xmm15 ; SSE-NEXT: psrlq $48, %xmm11 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm11[1] ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm11, %xmm13 +; SSE-NEXT: psrld $16, %xmm10 ; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[3,3,3,3] -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, %xmm13 -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: por %xmm13, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm3[1,1] -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: por 
%xmm11, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] +; SSE-NEXT: pandn %xmm11, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: psrld $16, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm12, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,2],xmm2[1,1] +; SSE-NEXT: pandn %xmm13, %xmm8 +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movaps %xmm2, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm14[0,1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,1] @@ -706,24 +705,24 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: andps %xmm4, %xmm6 ; SSE-NEXT: andnps %xmm0, %xmm4 ; SSE-NEXT: orps %xmm6, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm12[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: andps %xmm1, %xmm5 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm5, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm11[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm2, %xmm5 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: orps %xmm5, %xmm2 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movaps %xmm4, 64(%rax) ; SSE-NEXT: movdqa %xmm15, 16(%rax) -; SSE-NEXT: movdqa %xmm13, 32(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] -; SSE-NEXT: movaps %xmm7, 80(%rax) -; SSE-NEXT: movdqa %xmm9, 96(%rax) +; SSE-NEXT: movdqa %xmm8, 32(%rax) +; SSE-NEXT: movaps %xmm9, 48(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm1, 80(%rax) 
+; SSE-NEXT: movdqa %xmm10, 96(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf8: @@ -1193,336 +1192,338 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $232, %rsp +; SSE-NEXT: subq $216, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa 16(%r8), %xmm10 -; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: movdqa 16(%rax), %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm15 +; SSE-NEXT: movdqa 16(%rcx), %xmm1 +; SSE-NEXT: movdqa 16(%r8), %xmm8 +; SSE-NEXT: movdqa 16(%r9), %xmm7 +; SSE-NEXT: movdqa 16(%rax), %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps {{.*#+}} xmm15 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm15, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: andnps %xmm3, %xmm5 -; SSE-NEXT: orps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: andnps %xmm1, %xmm6 +; SSE-NEXT: orps %xmm0, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] +; 
SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa (%rdx), %xmm14 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: movdqa (%r9), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa (%rdx), %xmm13 ; SSE-NEXT: movdqa (%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm12 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: shufps 
{{.*#+}} xmm3 = xmm3[2,2],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,1] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm10, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 
%xmm9 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm9[0,2] -; SSE-NEXT: andps %xmm7, %xmm12 -; SSE-NEXT: orps %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,2,2,2] -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2] +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: orps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pandn %xmm14, %xmm7 +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[2,2,2,2] +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = 
xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm10[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: por %xmm5, %xmm11 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm10[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm5[2,0] -; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,5,6,6,7] -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; SSE-NEXT: andnps %xmm9, %xmm15 -; SSE-NEXT: orps %xmm4, %xmm15 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] -; SSE-NEXT: andnps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = 
xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,6,7] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm7[1,1] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: andnps %xmm5, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: orps %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: psrld $16, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] -; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,2],xmm13[1,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: andnps %xmm15, %xmm2 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1],mem[0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm7[2,1] -; SSE-NEXT: andps %xmm0, %xmm13 -; SSE-NEXT: orps %xmm15, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[0,1] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,1] -; SSE-NEXT: andps %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm9[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: andps %xmm5, %xmm14 -; SSE-NEXT: por %xmm14, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: andps %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = 
xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: movaps (%rsp), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,2],xmm11[1,1] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: andnps %xmm9, %xmm6 +; SSE-NEXT: orps %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm12, %xmm9 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm12[2,1] +; SSE-NEXT: andps %xmm1, %xmm11 +; SSE-NEXT: orps %xmm9, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,1] +; SSE-NEXT: andps %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm15[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: andps %xmm4, %xmm13 +; SSE-NEXT: por %xmm13, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: andps %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm5, 112(%rax) -; SSE-NEXT: movdqa %xmm7, (%rax) -; 
SSE-NEXT: movdqa %xmm0, 176(%rax) -; SSE-NEXT: movaps %xmm13, 64(%rax) -; SSE-NEXT: movaps %xmm2, 32(%rax) +; SSE-NEXT: movdqa %xmm4, 112(%rax) +; SSE-NEXT: movdqa %xmm5, (%rax) +; SSE-NEXT: movdqa %xmm1, 176(%rax) +; SSE-NEXT: movaps %xmm11, 64(%rax) +; SSE-NEXT: movaps %xmm6, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1533,10 +1534,10 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm3, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movdqa %xmm11, 96(%rax) +; SSE-NEXT: movdqa %xmm8, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: addq $232, %rsp +; SSE-NEXT: addq $216, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf16: @@ -1545,70 +1546,70 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm15 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: 
vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm9[1,2,3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1,2,3,4,5,6],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm2[6],xmm10[7] ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = 
zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm13[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm0 @@ -1616,31 +1617,31 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5],xmm7[6],xmm3[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5],xmm10[6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufhw 
{{.*#+}} xmm12 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm4[0,2],xmm12[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,2],xmm12[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm1 @@ -1649,30 +1650,30 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} 
xmm11 = xmm11[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[0,2],xmm13[1,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm11[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[0,2],xmm13[1,3] ; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] @@ -1699,7 +1700,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -1709,12 +1710,12 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] @@ -1724,7 +1725,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm2[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 @@ -1743,7 +1744,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,u,u,u,u,u,u,u,u,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 @@ -1770,26 +1771,28 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: pushq %rax +; AVX2-SLOW-NEXT: subq $40, %rsp ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 @@ -1797,6 +1800,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = 
ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = @@ -1807,23 +1811,23 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 @@ -1835,51 +1839,54 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = 
ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm12 +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm14 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7,8,9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw 
{{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] @@ -1896,63 +1903,68 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6,7,8],ymm11[9],ymm1[10,11],ymm11[12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] -; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 192(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: popq %rax +; AVX2-SLOW-NEXT: addq $40, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -2035,8 +2047,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 @@ -2044,8 +2056,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm8, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm11 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] @@ -2055,11 +2067,11 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3,4],xmm1[5],xmm9[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] @@ -2094,7 +2106,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] @@ -2119,7 +2131,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%rax) ; AVX2-FAST-NEXT: popq %rax ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -2133,18 +2145,20 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = @@ -2152,7 +2166,6 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> @@ -2167,9 +2180,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = @@ -2185,8 +2198,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,3] @@ -2205,21 +2218,22 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> @@ -2232,7 +2246,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> @@ -2244,20 +2258,20 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6,7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7,8,9],ymm1[10],ymm11[11,12],ymm1[13],ymm11[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,2,4,5,6,6] @@ -2265,19 +2279,19 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] @@ -2290,7 +2304,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2649,7 +2663,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $696, %rsp # imm = 0x2B8 +; SSE-NEXT: subq $680, %rsp # imm = 0x2A8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2657,30 +2671,31 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm1 ; SSE-NEXT: movdqa 48(%rcx), %xmm5 -; SSE-NEXT: movdqa 48(%r8), %xmm6 +; SSE-NEXT: movdqa 48(%r8), %xmm9 ; SSE-NEXT: movdqa 48(%r9), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rax), %xmm9 +; SSE-NEXT: movaps 48(%rax), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm0 @@ -2688,473 +2703,469 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 
%xmm0, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm14, %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: andnps %xmm9, %xmm1 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa (%r8), %xmm0 +; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm11 -; SSE-NEXT: movdqa (%r9), %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd 
{{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa (%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: andnps %xmm12, %xmm1 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,2,2,2] +; 
SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa 16(%r9), %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = 
xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 16(%r8), %xmm10 +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andnps %xmm12, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm13 ; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; 
SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm1 -; SSE-NEXT: movdqa 32(%r9), %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 32(%r8), %xmm10 +; SSE-NEXT: movdqa 32(%r9), %xmm9 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%rcx), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 32(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 32(%rcx), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa 32(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 
+; SSE-NEXT: movdqa 32(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andnps %xmm0, %xmm14 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm4, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm13[3,3] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm10, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm10 -; SSE-NEXT: orps %xmm4, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm5, %xmm4 +; SSE-NEXT: andnps %xmm2, %xmm5 +; SSE-NEXT: orps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: 
pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1] +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm6[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, 
%xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; SSE-NEXT: psrlq $48, %xmm15 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm14 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,6,6] +; 
SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[1,1] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm5[1,1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: andnps %xmm2, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] @@ -3162,22 +3173,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[1,1] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: andnps %xmm1, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm8 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm10[1,1] +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: andnps %xmm2, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] @@ -3185,21 +3195,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5,6,6] +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[1,1] -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: andnps %xmm1, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm5 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm12, %xmm10 +; SSE-NEXT: andnps %xmm1, %xmm10 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] @@ -3217,13 +3228,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] -; SSE-NEXT: andnps %xmm1, %xmm6 -; SSE-NEXT: orps %xmm0, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] +; SSE-NEXT: andnps %xmm1, %xmm12 +; SSE-NEXT: orps %xmm0, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2,2],mem[2,0] @@ -3236,7 +3246,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3244,117 +3254,114 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd 
{{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm11 +; SSE-NEXT: orps %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movapd %xmm7, %xmm13 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm13 +; SSE-NEXT: orps %xmm1, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[1],mem[0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[2,1] -; SSE-NEXT: andps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm7[2,1] -; SSE-NEXT: andps %xmm0, %xmm15 -; SSE-NEXT: orps %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,1] +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm4, 
%xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: andps %xmm6, %xmm8 +; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,1] -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm4, %xmm10 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: andps %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: andps %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] -; 
SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: andps %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: andps %xmm6, %xmm3 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm4, 336(%rax) -; SSE-NEXT: movdqa %xmm12, 224(%rax) -; SSE-NEXT: movdqa %xmm14, 112(%rax) -; SSE-NEXT: movdqa %xmm9, (%rax) -; SSE-NEXT: movdqa %xmm0, 288(%rax) -; SSE-NEXT: movaps %xmm15, 176(%rax) -; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movdqa %xmm6, 336(%rax) +; SSE-NEXT: movdqa %xmm9, 224(%rax) +; SSE-NEXT: movdqa %xmm8, 112(%rax) +; SSE-NEXT: movdqa %xmm1, (%rax) +; SSE-NEXT: movdqa %xmm2, 288(%rax) +; SSE-NEXT: movaps %xmm13, 176(%rax) +; SSE-NEXT: movaps %xmm11, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) -; SSE-NEXT: movaps %xmm6, 368(%rax) +; SSE-NEXT: movaps %xmm12, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3362,7 +3369,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) -; SSE-NEXT: movaps %xmm5, 256(%rax) +; SSE-NEXT: movaps %xmm10, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3370,7 +3377,8 @@ define void 
@store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm8, 144(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3382,7 +3390,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm13, 320(%rax) +; SSE-NEXT: movdqa %xmm15, 320(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3393,77 +3401,77 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 400(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%rax) -; SSE-NEXT: addq $696, %rsp # imm = 0x2B8 +; SSE-NEXT: addq $680, %rsp # imm = 0x2A8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 +; AVX1-ONLY-NEXT: subq $584, %rsp # imm = 0x248 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = 
zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm9 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm13 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,6,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -3474,34 +3482,34 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: 
vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 @@ -3513,72 +3521,71 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,5],xmm5[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps 
%ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm1[3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm5[0,2],xmm1[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm6[3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps 
%ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm5[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -3587,103 +3594,101 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = 
[65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm12[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm11[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, 
%xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm15[0,2],xmm10[1,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; 
AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 @@ -3691,12 +3696,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[0,2],xmm13[1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm13[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 @@ -3722,213 +3727,213 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm4 +; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm1[3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm2[3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm10 +; 
AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vpermilps $80, (%rsp), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = 
xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) -; AVX1-ONLY-NEXT: addq $600, %rsp # imm = 0x258 +; AVX1-ONLY-NEXT: addq $584, %rsp # imm = 0x248 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $600, %rsp # imm = 0x258 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX2-SLOW-NEXT: subq $616, %rsp # imm = 0x268 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-SLOW-NEXT: 
vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 @@ -3956,27 +3961,27 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> @@ -3987,13 +3992,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = 
ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4002,7 +4007,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4013,115 +4018,113 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm15, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm15, %ymm14, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2],xmm14[3,4],xmm6[5],xmm14[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3],xmm0[4],xmm6[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1],xmm6[2],xmm15[3,4],xmm6[5],xmm15[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm3[2],xmm12[3,4],xmm3[5],xmm12[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2],xmm14[3,4],xmm3[5],xmm14[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; 
AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm15 -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; 
AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -4131,7 +4134,9 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 @@ -4251,48 +4256,53 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm15, 224(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm14, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: addq $600, %rsp # imm = 0x258 +; AVX2-SLOW-NEXT: addq $616, %rsp # imm = 0x268 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $328, %rsp # imm = 0x148 +; AVX2-FAST-NEXT: subq $312, %rsp # imm = 0x138 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; 
AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> @@ -4301,21 +4311,26 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 
= ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] @@ -4324,132 +4339,129 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0,1],ymm2[2],ymm15[3,4],ymm2[5],ymm15[6,7,8,9],ymm2[10],ymm15[11,12],ymm2[13],ymm15[14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7,8,9,10],ymm10[11],ymm3[12,13],ymm10[14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm3[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7,8,9,10],ymm13[11],ymm1[12,13],ymm13[14],ymm1[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8,9,10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; 
AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8,9,10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm15, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, 
%ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} 
ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = @@ -4457,18 +4469,18 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, 
%ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 @@ -4476,192 +4488,195 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm8, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2,3],xmm8[4],xmm13[5,6],xmm8[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm10, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: 
vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2],xmm4[3,4],xmm14[5],xmm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1],xmm4[2],xmm11[3,4],xmm4[5],xmm11[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1],xmm8[2],xmm13[3,4],xmm8[5],xmm13[6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, 
%xmm2, %xmm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 320(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 
128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 352(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-FAST-NEXT: addq $312, %rsp # imm = 0x138 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm11, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: 
vpshufb %ymm6, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm15, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 @@ -4687,13 +4702,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] @@ -4705,8 +4720,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
<255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4716,11 +4731,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4729,7 +4744,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4743,107 +4758,106 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = 
[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm15, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1],xmm9[2],xmm15[3,4],xmm9[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 
{{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, 
%ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -4853,7 +4867,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 40(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; 
AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm5 @@ -4908,31 +4923,31 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = 
ymm9[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm7 @@ -4953,23 +4968,23 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 256(%rax) @@ -4983,997 +4998,1008 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride7_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $792, %rsp # imm = 0x318 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: subq $824, %rsp # imm = 0x338 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: 
vpor %ymm3, %ymm7, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm2 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm4 +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,7,6] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; 
AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vprold $16, %ymm14, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermi2d %zmm11, %zmm10, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vprold $16, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7,8,9,10],ymm0[11],ymm6[12,13],ymm0[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] -; AVX512F-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] 
-; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm15 -; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7,8,9],ymm11[10],ymm9[11,12],ymm11[13],ymm9[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm31 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7,8,9],ymm9[10],ymm11[11,12],ymm9[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm9, %ymm14, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7,8,9],ymm14[10],ymm3[11,12],ymm14[13],ymm3[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: 
vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8,9,10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-SLOW-NEXT: vprold $16, %ymm13, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7,8,9],ymm4[10],ymm12[11,12],ymm4[13],ymm12[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm8 +; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm4, %ymm6, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm17[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm10, %ymm12, %ymm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8,9,10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm31 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7,8,9],ymm10[10],ymm6[11,12],ymm10[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6,7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm27 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7,8],ymm1[9],ymm6[10,11],ymm1[12],ymm6[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-SLOW-NEXT: vprold $16, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm9 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm11 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm26[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7,8,9,10],ymm0[11],ymm12[12,13],ymm0[14],ymm12[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7,8,9,10],ymm0[11],ymm5[12,13],ymm0[14],ymm5[15] ; AVX512F-SLOW-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-SLOW-NEXT: vprold $16, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2],xmm14[3,4],xmm2[5],xmm14[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm4[1],xmm13[2,3],xmm4[4],xmm13[5,6],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm26[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm26[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7,8,9],ymm7[10],ymm14[11,12],ymm7[13],ymm14[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm10, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm31[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm18[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; 
AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512F-SLOW-NEXT: vprold $16, %xmm5, %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7,8,9],ymm15[10],ymm4[11,12],ymm15[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm6 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm27[0,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm19[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = 
ymm16[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6,7,8],ymm9[9],ymm0[10,11],ymm9[12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm18[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm16[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm13[2,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5,6,7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm22, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm16 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm7, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3 -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm14 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm19 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm11, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm18 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm19 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm21 = mem[2,1,3,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm10 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} 
xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm20[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm17[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm26[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm22[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm31 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm27, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm11[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm13[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm9 -; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} 
ymm5 = ymm8[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm24, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm26, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm22, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm15[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm21, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm4, %zmm10 +; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm31[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm15[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm24, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm25, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm20, %zmm9 +; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm29, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm31, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm6 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%rax) -; AVX512F-SLOW-NEXT: addq $792, %rsp # imm = 0x318 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512F-SLOW-NEXT: addq $824, %rsp # imm = 0x338 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: subq $744, %rsp # imm = 0x2E8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; 
AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb 
%xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm7, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm3, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,1,1,12,13,u,15> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm9, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm5, %ymm9, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,u,3,10,10,11,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm19, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb 
%ymm5, %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm10, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm29, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm6, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; 
AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7,8],ymm5[9],ymm9[10,11],ymm5[12],ymm9[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 
= ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7,8,9],ymm15[10],ymm2[11,12],ymm15[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,u,u,u,7,u,u,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm27, %ymm14, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm16[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm29 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, (%rsp), %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,0,2,1] -; 
AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm26[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm17[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm21[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm20[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm25[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm18[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm30, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; 
AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm29, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; 
AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: subq $744, %rsp # imm = 0x2E8 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 -; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 -; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm7 +; AVX512DQ-FAST-NEXT: vporq %ymm6, %ymm7, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm3, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,1,1,12,13,u,15> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vprold $16, %ymm9, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = 
[18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %ymm14, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = 
[12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm5, %ymm9, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm28 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,u,3,10,10,11,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm19, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm9 -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] +; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] ; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] +; AVX512DQ-FAST-NEXT: vprold $16, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm10, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; 
AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm7 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm29, %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm6, %xmm6 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7] ; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7,8],ymm5[9],ymm9[10,11],ymm5[12],ymm9[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7> -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: 
vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7,8,9],ymm15[10],ymm2[11,12],ymm15[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,u,u,u,7,u,u,7> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm14, %ymm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm16[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm29 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $234, (%rsp), %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,2,2,3] ; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, 
%zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17 -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm19 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19 +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm26[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm17[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm21[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm20[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm25[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm18[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm30, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512DQ-FAST-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm29, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm4 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-FAST-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -6135,1449 
+6161,1472 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1656, %rsp # imm = 0x678 +; SSE-NEXT: subq $1640, %rsp # imm = 0x668 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 112(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdx), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm1 ; SSE-NEXT: movdqa 96(%rcx), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rcx), %xmm8 -; SSE-NEXT: movdqa 112(%r8), %xmm5 -; SSE-NEXT: movdqa 112(%r9), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rax), %xmm11 +; SSE-NEXT: movdqa 112(%rcx), %xmm6 +; SSE-NEXT: movdqa 112(%r8), %xmm4 +; SSE-NEXT: movdqa 112(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rax), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: 
movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: andnps %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: andps %xmm1, %xmm4 +; SSE-NEXT: andnps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: orps %xmm4, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa 96(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa 96(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movdqa 96(%rax), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; 
SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: andps %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: andps %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rax), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: movdqa (%r9), %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa (%rcx), %xmm5 +; 
SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm12, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%r8), %xmm12 -; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm15, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm8, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%r8), %xmm1 -; SSE-NEXT: movdqa 32(%r9), %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 16(%r8), %xmm14 +; SSE-NEXT: movdqa 16(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = 
xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: 
pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 48(%r8), %xmm0 -; SSE-NEXT: movdqa 48(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 32(%r8), %xmm14 +; SSE-NEXT: movdqa 32(%r9), %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 48(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 48(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 48(%rsi), %xmm6 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por 
%xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 32(%rcx), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa 32(%rdx), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, 
%xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 48(%r8), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%r9), %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 48(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm2, %xmm0 +; 
SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm15, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm2, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 64(%r8), %xmm1 -; SSE-NEXT: movdqa 64(%r9), %xmm3 +; SSE-NEXT: movdqa 64(%r9), %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 64(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 64(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 64(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa 64(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa 64(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm9, %xmm0 ; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,65535,65535] -; 
SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 80(%rax), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa 80(%r8), %xmm1 -; SSE-NEXT: movdqa 80(%r9), %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa 80(%r9), %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 80(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 80(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 80(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa 80(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 80(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm9[2,2,2,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[3,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSE-NEXT: movdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm0, %xmm1 -; SSE-NEXT: andnps %xmm6, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm14, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: andps %xmm4, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: 
movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd (%rsp), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[3,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm13, %xmm3 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: orps %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: andps %xmm5, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm1 -; 
SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: andps %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pslld $16, 
%xmm1 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,3] +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; 
SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; 
SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, 
%xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[1,1] -; SSE-NEXT: movaps %xmm5, %xmm12 -; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[1,1] +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm15[1,1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: 
pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] -; SSE-NEXT: movaps %xmm14, %xmm11 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1] +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[1,1] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; 
SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm15[1,1] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1] +; SSE-NEXT: andnps %xmm1, %xmm5 +; SSE-NEXT: orps %xmm0, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0],mem[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: andps %xmm2, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm9, 
%xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE-NEXT: andnps %xmm1, %xmm5 +; SSE-NEXT: orps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2,2],mem[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm5, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: andnps %xmm0, %xmm5 +; SSE-NEXT: orps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movapd %xmm10, %xmm5 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm5 +; SSE-NEXT: orps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm12 +; SSE-NEXT: orps %xmm1, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 ; 
SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm6 +; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[1],mem[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm11 +; SSE-NEXT: orps %xmm1, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[1,1] -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: andps %xmm9, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: andnps %xmm1, %xmm9 -; SSE-NEXT: orps %xmm7, %xmm9 -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: andps %xmm3, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm9, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2,2],mem[2,0] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = 
zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movaps %xmm12, %xmm6 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm6 -; SSE-NEXT: orps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movapd %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: movaps %xmm14, %xmm9 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm9 ; SSE-NEXT: orps %xmm1, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm15, %xmm8 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm8 -; SSE-NEXT: orps %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: 
por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm7 ; SSE-NEXT: orps %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movapd %xmm14, %xmm5 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm5 ; SSE-NEXT: orps %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: 
pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,1] -; SSE-NEXT: andps %xmm0, %xmm3 -; SSE-NEXT: orps %xmm6, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,1] -; SSE-NEXT: andps %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: andps %xmm2, %xmm10 -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} 
xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: andps %xmm4, %xmm13 +; SSE-NEXT: por %xmm13, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: andps %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: andps %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: andps %xmm2, 
%xmm4 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: andps %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 672(%rax) +; SSE-NEXT: movdqa %xmm4, 672(%rax) ; SSE-NEXT: movdqa %xmm15, 560(%rax) -; SSE-NEXT: movdqa %xmm10, 448(%rax) -; SSE-NEXT: movdqa %xmm11, 336(%rax) -; SSE-NEXT: movdqa %xmm12, 224(%rax) -; SSE-NEXT: movdqa %xmm13, 112(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: movdqa %xmm0, 736(%rax) -; SSE-NEXT: movaps %xmm3, 624(%rax) -; SSE-NEXT: movaps %xmm5, 512(%rax) -; SSE-NEXT: movaps %xmm7, 400(%rax) -; SSE-NEXT: movaps %xmm8, 288(%rax) -; SSE-NEXT: movaps %xmm9, 176(%rax) +; SSE-NEXT: movdqa %xmm13, 448(%rax) +; SSE-NEXT: movdqa %xmm3, 336(%rax) +; SSE-NEXT: movdqa %xmm6, 224(%rax) +; SSE-NEXT: movdqa %xmm8, 112(%rax) +; SSE-NEXT: movdqa %xmm10, (%rax) +; SSE-NEXT: movdqa %xmm2, 736(%rax) +; SSE-NEXT: movaps %xmm5, 624(%rax) +; SSE-NEXT: movaps %xmm7, 512(%rax) +; SSE-NEXT: movaps %xmm9, 400(%rax) +; SSE-NEXT: movaps %xmm11, 288(%rax) +; SSE-NEXT: movaps %xmm12, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7664,144 +7713,145 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: addq $1656, %rsp # imm = 0x678 +; SSE-NEXT: addq $1640, %rsp # imm = 0x668 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX1-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} 
xmm4 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0],xmm3[1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2],xmm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm2[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm7[1,2,3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm11[2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] 
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 ; AVX1-ONLY-NEXT: vandps %ymm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm15 ; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5],xmm12[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm2[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,5,6,6,7] +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: 
vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,5,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1,2,3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2,3,4,5,6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3,4,5,6],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa 96(%rax), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm4[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm3[3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 @@ -7824,9 +7874,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, 
%ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7841,9 +7891,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm5 @@ -7859,29 +7909,28 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm4[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vmovdqa 
16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 @@ -7896,27 +7945,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,2],xmm3[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[0,2],xmm3[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] @@ -7934,8 +7982,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, 
%ymm14, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] @@ -7943,7 +7992,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] @@ -7956,264 +8005,268 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm11 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm3, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm8[6],xmm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6],xmm7[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 
{{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,2],xmm4[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm13 +; 
AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm8[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm15[0,2],xmm11[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,2],xmm11[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 
%xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm8[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm11[3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm13, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm10 ; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa 64(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = 
xmm13[0,1,2,3,4],xmm14[5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm1, %ymm8 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,2],xmm6[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm4, %ymm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovdqa 80(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rax), %xmm0 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa 80(%rax), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 
%xmm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4],xmm6[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[0,2],xmm0[1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm15[6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 @@ -8221,33 +8274,34 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm1[3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} 
xmm15 = xmm15[0,1,2],xmm2[3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] @@ -8255,44 +8309,43 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm14[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = 
xmm15[0,1,2,3,4],xmm13[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,0,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm3, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm13[6],xmm12[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6],xmm13[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm14[0,2],xmm0[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm10, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm0[3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 ; 
AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 @@ -8308,8 +8361,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8326,9 +8379,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm6 @@ -8357,11 +8410,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm8, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm13, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8378,17 +8431,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm12 @@ -8404,28 +8457,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm10, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm3[1],xmm12[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload @@ -8446,14 +8498,14 @@ define void 
@store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm14 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8470,11 +8522,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] @@ -8487,67 +8539,70 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} 
xmm13 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm4[1],xmm11[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm3[1],xmm11[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm0[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8664,13 +8719,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, 848(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 832(%rax) -; AVX1-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX1-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1720, %rsp # imm = 0x6B8 +; AVX2-SLOW-NEXT: subq $1688, %rsp # imm = 0x698 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8680,120 +8735,121 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 
(%rax), %ymm9 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,3,u,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm10, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,3,u,u,u,4> +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm11, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm11, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm11, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: 
vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <3,u,u,u,4,u,u,4> +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), 
%ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,1,3,4,5,5,7] 
+; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] @@ -8819,16 +8875,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -8850,13 +8905,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: 
vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -8881,255 +8936,259 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-SLOW-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm15, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm15, %ymm14, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,2,2] -; AVX2-SLOW-NEXT: 
vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd $165, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm8 -; AVX2-SLOW-NEXT: vpshufd $165, (%rsp), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[1,1,2,2] 
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7] -; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2],xmm14[3,4],xmm6[5],xmm14[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3],xmm1[4],xmm14[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[1,1,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm8, %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3],xmm4[4],xmm14[5,6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = mem[1,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm8, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-SLOW-NEXT: 
vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; 
AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] @@ -9153,94 +9212,118 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm9 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 
# 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,3,u,u,u,4> -; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # 
ymm5 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm1
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm3 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm1
+; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2]
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm7 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5]
-; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm8 = mem[1,1,1,1,5,5,5,5]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4]
-; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,0,2,1,4,4,6,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,0,2,1,4,4,6,5]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4]
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,0,2,1,4,4,6,5]
@@ -9249,210 +9332,189 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm5
+; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm6 = mem[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
+; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm7 = mem[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5
+; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm6
+; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm9
+; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm7 = mem[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
+; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6
+; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm7
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
+; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7
+; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm8 = mem[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[1,2,2,3,5,6,6,7]
+; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm2
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,2,2,4,5,6,6]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 =
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[0,1,1,3,4,5,5,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[1,2,2,3,5,6,6,7]
+; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm1
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7,8,9],ymm9[10],ymm11[11,12],ymm9[13],ymm11[14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,2,4,5,6,6]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[0,0,2,1,4,4,6,5]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[0,1,1,3,4,5,5,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[1,2,2,3,5,6,6,7]
+; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm15
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,1,2,2,4,5,6,6]
+; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm3
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm12, %ymm10
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6
 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm7
-; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm7, %ymm8
-; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
-; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15]
+; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm9, %ymm8
-; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm9
-; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8
-; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm10 = mem[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX2-SLOW-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm8, %ymm9, %ymm8
+; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm11 = mem[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8,9,10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
+; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm10 = mem[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm10, %ymm9
-; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm10
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
-; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm10, %ymm9
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6]
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8,9,10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
+; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
-; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm5[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[1,2,2,3,5,6,6,7]
-; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm5
+; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm10, %ymm11, %ymm10
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,3,6,6,6,7]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,1,2,2,4,5,6,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 =
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,3,3,6,7,7,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0>
 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[1,2,2,3,5,6,6,7]
-; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[0,1,2,2,4,5,6,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[1,2,2,3,5,6,6,7]
-; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm2
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,3,3,6,7,7,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,2,2,4,5,6,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm13
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
-; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 =
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm11
-; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8,9,10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
-; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm12
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[3,3,3,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8,9,10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,3,3,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,2,6,6,6,6]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13,14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
-; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm13
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[3,3,3,3,7,7,7,7]
+; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7]
 ; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
 ; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0>
-; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7,8,9],ymm15[10],ymm1[11,12],ymm15[13],ymm1[14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2]
-; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7]
-; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7,8,9],ymm15[10],ymm2[11,12],ymm15[13],ymm2[14,15]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[2,3,3,3,6,7,7,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2]
-; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
-; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm14, %ymm11
-; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm13
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm11, %ymm8
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9
+; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, 544(%rax)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, 320(%rax)
-; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 640(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm10, 608(%rax)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rax)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax)
+; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm10, 640(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm7, 608(%rax)
 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 416(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm9, 416(%rax)
 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 384(%rax)
 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax)
-; AVX2-SLOW-NEXT: vmovdqa %ymm11, 192(%rax)
+; AVX2-SLOW-NEXT: vmovdqa %ymm8, 192(%rax)
 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax)
 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -9489,23 +9551,23 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 832(%rax)
 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vmovaps %ymm0, 800(%rax)
-; AVX2-SLOW-NEXT: addq $1720, %rsp # imm = 0x6B8
+; AVX2-SLOW-NEXT: addq $1688, %rsp # imm = 0x698
 ; AVX2-SLOW-NEXT: vzeroupper
 ; AVX2-SLOW-NEXT: retq
 ;
 ; AVX2-FAST-LABEL: store_i16_stride7_vf64:
 ; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: subq $1240, %rsp # imm = 0x4D8
+; AVX2-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8
 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm1
 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm2
 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm3
-; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm4
+; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm4
+; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm3
+; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <3,u,u,u,4,u,u,4>
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
@@ -9513,13 +9575,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 =
 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 =
-; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 =
-; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm1
@@ -9535,305 +9597,314 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1
-; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,3,3]
-; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm3
-; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7]
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3
-; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 =
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm8
-; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5
-; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm6
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm2
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
-; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm13
-; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm7
-; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm9
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3
+; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,0,2,1,4,4,6,5]
+; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm11
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
-; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm2
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm11
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm12
-; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm15
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,5,7]
-; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0>
-; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm8, %ymm0
+; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm4
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,5,7]
+; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3
+; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm12
+; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm6
+; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4
+; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7
+; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,0,2,1,4,4,6,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm4
+; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm6
+; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm1
+; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm1
 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm0
-; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
-; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm1
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm4
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5
+; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1
 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm8
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15
+; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm13
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,1,1,3,4,5,5,7]
+; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
-; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm7, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm0
+; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm7
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
-; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2
-; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
+; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0
+; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm3
+; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,1,3,4,5,5,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2]
+; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm0
+; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0
+; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm1
+; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7,8,9],ymm9[10],ymm1[11,12],ymm9[13],ymm1[14,15]
+; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm3
+; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,1,3,4,5,5,7]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm11, %ymm0
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 =
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,2,2,3,5,6,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
+; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm5
+; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1
+; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6
+; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,2,2,3,5,6,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,2,6,6,6,6]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 =
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,2,6,6,6,6]
+; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8
+; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm12
+; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm2
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,2,2,3,5,6,6,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[1,2,2,3,5,6,6,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7,8,9],ymm3[10],ymm14[11,12],ymm3[13],ymm14[14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2]
-; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm8
-; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpermd %ymm13, %ymm10, %ymm14
-; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm14, %ymm3
-; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm14
+; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm1
+; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[1,2,2,3,5,6,6,7]
+; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[1,2,2,3,5,6,6,7]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FAST-NEXT: vpermd %ymm9, %ymm13, %ymm14
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm14, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm14
+; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm11
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u>
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm4
+; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,2,2,6,6,6,6]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7,8,9],ymm14[10],ymm4[11,12],ymm14[13],ymm4[14,15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u>
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm14
-; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm14
+; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
-; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm11, %ymm14, %ymm11
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm14, %ymm4
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm1
 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,2,2,2,6,6,6,6]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11
+; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm1
+; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
+; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4
 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15
-; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm11, %ymm1
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm3, %ymm1
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm3 = mem[2,2,2,2,6,6,6,6]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX2-FAST-NEXT: # ymm11 = mem[2,2,2,2,6,6,6,6]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15]
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[2,2,2,2,6,6,6,6]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0>
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7,8,9],ymm14[10],ymm3[11,12],ymm14[13],ymm3[14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm14
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm14, %ymm3
-; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm0
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7,8,9],ymm14[10],ymm0[11,12],ymm14[13],ymm0[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm14
-; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm14, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 =
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm11
+; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm14
+; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7,8,9],ymm14[10],ymm2[11,12],ymm14[13],ymm2[14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm14
+; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2
+; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm4
+; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7,8,9,10],ymm4[11],ymm14[12,13],ymm4[14],ymm14[15]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,3,3,3,7,7,7,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8,9,10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u>
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm14
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm14
+; AVX2-FAST-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX2-FAST-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 =
-; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm11, %ymm14, %ymm11
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
-; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm11, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm14, %ymm4
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm1
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[3,3,3,3,7,7,7,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3],ymm11[4,5],ymm1[6],ymm11[7,8,9,10],ymm1[11],ymm11[12,13],ymm1[14],ymm11[15]
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm11
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,3,3,3,7,7,7,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13,14,15]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15]
+; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm4
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm11, %ymm1
-; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12
-; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[3,3,3,3,7,7,7,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm0
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[3,3,3,3,7,7,7,7]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm1
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,u,u,u,4,u,u,4>
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,u,4,u,u,4>
+; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 =
 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3,
+-; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3,
%ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,u,u,3,u,u,u,4> @@ -9851,15 +9922,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] @@ -9882,9 +9953,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] @@ -9904,15 +9975,16 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastd 124(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -9926,317 +9998,315 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm15 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm15 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm9 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm13 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm0, 
%xmm1, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 64(%rax), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 96(%rax), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd 64(%rax), %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm11 +; AVX2-FAST-NEXT: vmovdqa %xmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd 96(%rax), %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm10, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm7, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: 
vpshufb %xmm0, %xmm5, %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vpshufd $165, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3],xmm0[4],xmm6[5,6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4],xmm2[5],xmm6[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 68(%rax), %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpshufd $165, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 100(%rax), %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm1 -; AVX2-FAST-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm12, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2],xmm14[3,4],xmm2[5],xmm14[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3,4],xmm14[5],xmm11[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3],xmm1[4],xmm11[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1],xmm6[2],xmm11[3,4],xmm6[5],xmm11[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 68(%rax), %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 100(%rax), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = 
ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; 
AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 72(%rax), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 104(%rax), %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 72(%rax), %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 104(%rax), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 544(%rax) -; AVX2-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 640(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 608(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 576(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 416(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 384(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 768(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 736(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 704(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 672(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 544(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 640(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 608(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 576(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 416(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 384(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 768(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 736(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 704(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 512(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 480(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 448(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 288(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 480(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10247,7 +10317,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 832(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-FAST-NEXT: addq $1240, %rsp # imm = 0x4D8 +; AVX2-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -10256,189 +10326,188 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = 
[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm9, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm7, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm8, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm10, %ymm8 -; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm9 +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] 
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -10451,7 +10520,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -10463,243 +10532,241 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm3 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rax), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm15, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm15, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = 
<255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1],xmm9[2],xmm15[3,4],xmm9[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm9[1],xmm15[2,3],xmm9[4],xmm15[5,6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq 
{{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm9, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3],xmm0[4],xmm13[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1],xmm3[2],xmm13[3,4],xmm3[5],xmm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm5, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3],xmm1[4],xmm13[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm5[2],xmm13[3,4],xmm5[5],xmm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastd 36(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] 
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] @@ -10723,17 +10790,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 104(%rax), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] @@ -10762,45 +10829,46 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm14, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb 
%ymm7, %ymm5, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7,8,9,10],ymm11[11],ymm7[12,13],ymm11[14],ymm7[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -10808,12 +10876,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm11, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] @@ -10821,61 +10889,62 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = 
mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} 
ymm11 = ymm13[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm14, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -10883,74 +10952,74 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm13 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7,8],ymm12[9],ymm13[10,11],ymm12[12],ymm13[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8,9,10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7,8,9],ymm1[10],ymm15[11,12],ymm1[13],ymm15[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,3,3,3,6,7,7,7] @@ -11018,570 +11087,562 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride7_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $2440, %rsp # imm = 0x988 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm0 +; AVX512F-SLOW-NEXT: subq $2456, %rsp # imm = 0x998 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm3 +; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm16 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm0 -; 
AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm4, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm3 +; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm10, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX512F-SLOW-NEXT: vpshufb 
%ymm11, %ymm9, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm13 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm11 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vprold $16, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,3,6,6,6,7] -; 
AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,3,2,10,10,10,11] -; AVX512F-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8,9,10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm0 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm0[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] 
-; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm15[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-SLOW-NEXT: vprold $16, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm19, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[14,15],zero,zero,ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[16,17],zero,zero,ymm8[u,u],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm19, %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm15[2,1,3,3] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm11[2,2,2,2] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512F-SLOW-NEXT: vpor %ymm15, %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vprold $16, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,1,3,2,10,10,10,11] +; AVX512F-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512F-SLOW-NEXT: # 
ymm14 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8,9,10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm13 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,1,2,3,6,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-SLOW-NEXT: vprold $16, %ymm13, %ymm8 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm12 +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm20, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15],zero,zero,ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17],zero,zero,ymm7[u,u],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm14, %ymm4, %ymm7 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm12 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[2,1,3,3] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm8[2,2,2,2] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm12[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,2,2,4,5,6,6] ; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[2,3,3,3,6,7,7,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm19, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm15 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, 
%xmm16 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpandnq %ymm11, %ymm10, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm30 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm20, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm26 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX512F-SLOW-NEXT: vpandnq %ymm10, %ymm19, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-SLOW-NEXT: vpandn %ymm7, %ymm14, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpandnq %ymm4, %ymm25, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm12 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm25, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm15[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpandn %ymm13, %ymm14, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm15, %ymm29 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7,8,9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = 
ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7,8,9,10],ymm8[11],ymm0[12,13],ymm8[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm28 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-SLOW-NEXT: vprold $16, %ymm31, %ymm1 -; AVX512F-SLOW-NEXT: 
vpshufd {{.*#+}} ymm2 = ymm18[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7,8,9,10],ymm8[11],ymm2[12,13],ymm8[14],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] -; AVX512F-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512F-SLOW-NEXT: vprold $16, %ymm18, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] +; AVX512F-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] 
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm11, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm13, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] +; AVX512F-SLOW-NEXT: vmovdqa 
96(%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,3,3,6,7,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX512F-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm18, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vprold $16, %xmm1, %xmm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vprold $16, %xmm4, %xmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vprold $16, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; 
AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm6, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm6 -; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm2 +; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vprold $16, %ymm4, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm12 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm16 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vprold $16, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd 
{{.*#+}} ymm3 = ymm10[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm27, %zmm28 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = 
ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm13, %zmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm30 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm5, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vprold $16, %xmm5, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm7[2],xmm13[3,4],xmm7[5],xmm13[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm30 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm18, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-SLOW-NEXT: vprold $16, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 ; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm13[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm2[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm5[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm14 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm7[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm6 -; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm1[1],xmm15[2,3],xmm1[4],xmm15[5,6],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm27[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm27[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm31 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm4[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm15 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm4[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm6[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm6[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermpd $232, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm31[2,1,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm2, %zmm12 +; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm31 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm12[1],xmm15[2,3],xmm12[4],xmm15[5,6],xmm12[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm29[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm17 = ymm29[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm29 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm8 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm11 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd 
{{.*#+}} xmm6 = xmm12[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm25[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm23[2,2,2,3] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm1 = mem[2,1,3,2] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 @@ -11592,32 +11653,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm29[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm4[0,1,2,3],zmm23[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm19 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm25, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm19 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm20 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm2, %zmm20 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm25, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm2, %zmm10 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm2, %zmm26 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm26 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm23 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm10 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm22 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm16[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm27 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] ; AVX512F-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm1, %ymm0 @@ -11627,811 +11688,797 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm1 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm15[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm14[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 +; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm16 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm18[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm17[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 +; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm17 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm18, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq 
$226, %zmm3, %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm24, %zmm4 -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm21 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm7 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm19, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm1 +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw $180, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = mem[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] ; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm8 = mem[0,0,1,1] ; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm11 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq 
$232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm15 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = mem[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3] ; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm18 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = mem[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm30 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm7 = mem[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm30 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm25, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm15, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm25, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm31 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm27 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm17 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 256(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 832(%rax) -; AVX512F-SLOW-NEXT: addq $2440, %rsp # imm = 0x988 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax) +; AVX512F-SLOW-NEXT: addq $2456, %rsp # imm = 0x998 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $2264, %rsp # imm = 0x8D8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2 -; 
AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm6, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm6, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm15, %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm4, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7,8,9],ymm15[10],ymm11[11,12],ymm15[13],ymm11[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7,8,9,10],ymm0[11],ymm11[12,13],ymm0[14],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm12, %ymm11, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm1, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm12, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = 
ymm10[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm17, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,u,u,u,6,u,u,6> +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero,ymm10[u,u],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero,ymm11[u,u],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm26, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm24, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm9, %ymm23, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rax), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,1,1,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm27, %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm10, %ymm17, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm17, %zmm10 +; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm27, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7,8,9,10],ymm10[11],ymm0[12,13],ymm10[14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,u,3,10,10,11,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm10, %ymm23, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm16 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm16, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte 
Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm12, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
{{.*#+}} zmm27 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,3,3,7,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm2, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3],xmm3[4],xmm7[5,6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd 
{{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm15, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm14, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm11, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm5, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm8, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 
{{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm2, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm8, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8,9,10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7,8,9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm29, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm7[2],xmm12[3,4],xmm7[5],xmm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 
-; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm2[1],xmm15[2,3],xmm2[4],xmm15[5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq 
{{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm16[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm11[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm17[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm9, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb 
%xmm9, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm0[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm6[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[2,2,2,3] +; 
AVX512F-ONLY-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm29 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm16[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm10[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm12[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm14[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm28[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm26[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm23[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm21[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm10, %zmm8, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm10, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, 
%zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm22 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 
64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm20, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm30, 
%zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm25 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm26 = mem[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded 
Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm26 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,1,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[2,1,3,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload @@ -12442,44 +12489,45 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,0,1,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm13 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm3 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: addq $2264, %rsp # imm = 0x8D8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -12487,684 +12535,669 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $2264, %rsp # imm = 0x8D8 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm2, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm6, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm6, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm3, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm3, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; 
AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm3, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm3, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512DQ-FAST-NEXT: vporq %ymm15, %ymm0, %ymm22 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vprold $16, %ymm4, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufd 
{{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7,8,9],ymm15[10],ymm11[11,12],ymm15[13],ymm11[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7,8,9,10],ymm0[11],ymm11[12,13],ymm0[14],ymm11[15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm12 +; AVX512DQ-FAST-NEXT: vporq %ymm12, %ymm11, %ymm22 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX512DQ-FAST-NEXT: vprold $16, %ymm1, %ymm12 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw 
{{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm12, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; 
AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6> -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,u,u,u,6,u,u,6> +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX512DQ-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512DQ-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[14,15],zero,zero,ymm10[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[16,17],zero,zero,ymm10[u,u],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero,ymm11[u,u],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, 
%ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8 -; AVX512DQ-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm27 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm9 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm8 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm26, %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm24, %ymm9 +; AVX512DQ-FAST-NEXT: vpandnq %ymm9, %ymm23, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm8 +; AVX512DQ-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rax), %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm16 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,1,1,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm27, %zmm13, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vpandnq %ymm10, %ymm17, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm17, %zmm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm13 +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm27, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7,8,9,10],ymm10[11],ymm0[12,13],ymm10[14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,u,3,10,10,11,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2 -; AVX512DQ-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vpermd 
%ymm0, %ymm24, %ymm10 +; AVX512DQ-FAST-NEXT: vpandnq %ymm10, %ymm23, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm16 ; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm16, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm13 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm3, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm1 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm13 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm3, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm23 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm4 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm22 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3],xmm3[4],xmm7[5,6],xmm3[7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm15, %xmm30 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm13, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm6 ; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm18 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX512DQ-FAST-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm11, %xmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm5, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vprold $16, %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 -; 
AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm16, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; 
AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm12 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8,9,10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7,8,9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm30, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm29, %ymm0, %ymm3 ; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; 
AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm2, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm7[2],xmm12[3,4],xmm7[5],xmm12[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm11, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm23 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm2[1],xmm15[2,3],xmm2[4],xmm15[5,6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; 
AVX512DQ-FAST-NEXT: # ymm2 = mem[2,2,2,3]
-; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,2,2,3]
-; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
-; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
-; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm4
-; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,2,3,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm4
-; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm4 = mem[2,1,3,2]
-; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,2,2,3]
-; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3]
-; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,1,3,3]
-; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,2,2,3]
-; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3]
-; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,2,2,3]
-; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3]
-; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm31 = mem[2,1,3,2]
-; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm30 = mem[2,2,2,3]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm16[0,1,1,3]
-; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,0,1,1]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm11[0,0,1,1]
-; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm17[0,1,1,3]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,0,1,1]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1]
-; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,0,2,1]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,2,2,3]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,2,3]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
-; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm12
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm29, %zmm0
-; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm10
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm14
+; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8
+; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0
+; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm24
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm24
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm13
+; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm5
+; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm3
+; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7]
+; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm20 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm7 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm22 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm0[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm6[3,3,3,3]
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,2,2,2]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm15
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,0,1,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm5
+; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm9 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm30 = mem[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm29 = mem[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm16[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm10[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm12[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm14[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm28[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm26[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm23[2,1,3,2]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm21[2,2,2,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm8, %zmm10
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm8, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm7
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2
; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm7
-; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm1
-; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm8
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm21
+; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm22 # 64-byte Folded Reload
; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm20, %ymm2
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm2
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm1
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm1
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm6
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm20
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm22
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm1
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm27, %zmm6
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm18, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm5
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm30, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm5
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm25
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm25
; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm28
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3
-; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,2,2,3]
-; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,1,3]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,3]
-; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,1,1]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1]
-; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,0,1,3]
-; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,0,1,1]
-; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1]
-; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,2,2,3]
-; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm1
+; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,1,3]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,1,3,3]
+; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,0,1,3]
+; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,0,1,1]
+; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,2,2,3]
+; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,1,3]
; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX512DQ-FAST-NEXT: # ymm15 = mem[2,1,3,3]
; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
@@ -13175,44 +13208,45 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,0,1,3]
; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,0,1,1]
-; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX512DQ-FAST-NEXT: # xmm13 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm5
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm10, %zmm1
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm10
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm15
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm21
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm1
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm2
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1
-; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm5
-; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm6
-; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm23
-; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4
-; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14
+; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # xmm3 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm27
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm0
+; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm19, %zmm3
+; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm24
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm14
+; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7
; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax)
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, (%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 448(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax)
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 384(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax)
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 832(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 512(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 768(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax)
; AVX512DQ-FAST-NEXT: addq $2264, %rsp # imm = 0x8D8
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
@@ -13221,223 +13255,222 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: subq $136, %rsp
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm21
-; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm25
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15
+; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9
; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm20
-; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm10
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11
-; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3
-; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm31
+; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25
+; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12
+; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13
+; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4
+; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm30
; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8
-; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm19
-; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm9
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm12
-; AVX512BW-NEXT: vpermt2w %zmm25, %zmm0, %zmm12
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
-; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13
-; AVX512BW-NEXT: vpermt2w %zmm20, %zmm7, %zmm13
+; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26
+; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm20
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
+; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm0
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
+; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-NEXT: vpermt2w %zmm25, %zmm6, %zmm3
; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1}
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u>
-; AVX512BW-NEXT: vpermi2w %zmm31, %zmm13, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14
-; AVX512BW-NEXT: vpermt2w %zmm11, %zmm0, %zmm14
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
-; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15
-; AVX512BW-NEXT: vpermt2w %zmm25, %zmm13, %zmm15
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
+; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0
+; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm0
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
+; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3
+; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm3
; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18
; AVX512BW-NEXT: kmovd %ecx, %k2
-; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm15 {%k2}
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 =
-; AVX512BW-NEXT: vpermi2w %zmm31, %zmm15, %zmm0
+; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
+; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm0
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16
-; AVX512BW-NEXT: vpermt2w %zmm10, %zmm28, %zmm16
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
-; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15
-; AVX512BW-NEXT: vpermt2w %zmm21, %zmm30, %zmm15
+; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm7
; AVX512BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C
; AVX512BW-NEXT: kmovd %ecx, %k3
-; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm15 {%k3}
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
-; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0
-; AVX512BW-NEXT: vpermt2w %zmm8, %zmm14, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm12
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm29
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
-; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm23
-; AVX512BW-NEXT: vpermt2w %zmm26, %zmm24, %zmm23
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22
-; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27
-; AVX512BW-NEXT: vpermt2w %zmm11, %zmm28, %zmm20
-; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm28
-; AVX512BW-NEXT: vpermt2w %zmm25, %zmm30, %zmm26
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 =
-; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3
+; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16
+; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27
+; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm17
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
+; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31
+; AVX512BW-NEXT: vpermt2w %zmm29, %zmm23, %zmm31
+; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18
+; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25
+; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19
+; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
-; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm26 {%k3}
+; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3}
; AVX512BW-NEXT: kmovd %ecx, %k3
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm15 {%k3}
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6
-; AVX512BW-NEXT: vpermt2w %zmm31, %zmm0, %zmm6
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm18
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm30
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm14, %zmm31
-; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20
-; AVX512BW-NEXT: vpermt2w %zmm20, %zmm4, %zmm31
-; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm26 {%k3}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm4
-; AVX512BW-NEXT: vpermi2w %zmm5, %zmm10, %zmm7
-; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm7 {%k1}
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
-; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 =
-; AVX512BW-NEXT: vpermi2w %zmm9, %zmm4, %zmm14
+; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10
+; AVX512BW-NEXT: vpermt2w %zmm30, %zmm0, %zmm10
+; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22
+; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1
+; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm28
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30
+; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25
+; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30
+; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3}
+; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm21
+; AVX512BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm6
+; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1}
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 =
+; AVX512BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21
; AVX512BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E
; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm7 {%k3}
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
-; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm25, %zmm4, %zmm12
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
-; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm16
+; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3}
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm3
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
+; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16
; AVX512BW-NEXT: movl $202911840, %eax # imm = 0xC183060
; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm16 {%k3}
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm12
-; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm13
-; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k2}
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm12
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u>
-; AVX512BW-NEXT: vpermi2w %zmm9, %zmm12, %zmm31
+; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3}
+; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm24
+; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm11
+; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u>
+; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24
; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1
; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm13 {%k2}
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm12, %zmm18
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 =
-; AVX512BW-NEXT: vpermt2w %zmm20, %zmm31, %zmm18
+; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 =
+; AVX512BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22
; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C
; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k2}
-; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm4
-; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm14
-; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k3}
-; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm12
-; AVX512BW-NEXT: vpermt2w %zmm9, %zmm31, %zmm12
-; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm14 {%k2}
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
-; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm11, %zmm4, %zmm29
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
-; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm25, %zmm12, %zmm17
-; AVX512BW-NEXT: vmovdqu16 %zmm29, %zmm17 {%k1}
-; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm4
-; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm12
-; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm12 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u>
-; AVX512BW-NEXT: vpermt2w %zmm20, %zmm4, %zmm6
+; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
+; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm2
+; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm21
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3}
+; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
+; AVX512BW-NEXT: vpermt2w %zmm20, %zmm24, %zmm3
+; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2}
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermt2w %zmm13, %zmm2, %zmm27
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
+; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermt2w %zmm9, %zmm3, %zmm17
+; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm17 {%k1}
+; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2
+; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u>
+; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10
; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm17 {%k1}
-; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm0
-; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm12 {%k1}
+; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1}
+; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm0
+; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11]
; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm11, %zmm0, %zmm22
+; AVX512BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm18
; AVX512BW-NEXT: movl $405823681, %eax # imm = 0x183060C1
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm22 {%k1}
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
-; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm4, %zmm1
-; AVX512BW-NEXT: vpermi2w %zmm2, %zmm21, %zmm24
-; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm0 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 =
-; AVX512BW-NEXT: vpermt2w %zmm20, %zmm6, %zmm1
+; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1}
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermt2w %zmm26, %zmm2, %zmm1
+; AVX512BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23
+; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 =
+; AVX512BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1
; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm22 {%k1}
-; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm4
-; AVX512BW-NEXT: vpermt2w %zmm9, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1}
+; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
+; AVX512BW-NEXT: vpermt2w %zmm20, %zmm10, %zmm2
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm11, %zmm1, %zmm27
-; AVX512BW-NEXT: vpermt2w %zmm10, %zmm1, %zmm5
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm2
+; AVX512BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm5
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28
-; AVX512BW-NEXT: vpermt2w %zmm21, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm28 {%k3}
+; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm19
+; AVX512BW-NEXT: vpermt2w %zmm15, %zmm1, %zmm14
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm30
+; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4
+; AVX512BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u>
-; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm30
-; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm2 {%k3}
+; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4
+; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28
+; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3}
; AVX512BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm28 {%k1}
-; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1}
+; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u>
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2w %zmm19, %zmm3, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 =
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 =
; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512BW-NEXT: vpermi2w %zmm19, %zmm4, %zmm3
+; AVX512BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63]
-; AVX512BW-NEXT: vpermi2w %zmm20, %zmm1, %zmm4
+; AVX512BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31]
-; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm1
+; AVX512BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm28, 512(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm22, 576(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax)
; AVX512BW-NEXT: addq $136, %rsp
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index 36c79fa995dcc..e5a6eea44b8ad 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -538,7 +538,7 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movdqa (%rcx), %xmm11
; SSE-NEXT: movdqa (%r8), %xmm4
; SSE-NEXT: movdqa (%r9), %xmm8
-; SSE-NEXT: movdqa (%r10), %xmm2
+; SSE-NEXT: movdqa (%r10), %xmm3
; SSE-NEXT: movdqa (%rax), %xmm10
; SSE-NEXT: movdqa %xmm1, %xmm13
; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
@@ -546,14 +546,14 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; SSE-NEXT: movdqa %xmm12, %xmm5
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1]
-; SSE-NEXT: movdqa %xmm2, %xmm14
+; SSE-NEXT: movdqa %xmm3, %xmm14
; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0]
; SSE-NEXT: movdqa %xmm4, %xmm15
; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,0,0]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,0,0]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
; SSE-NEXT: movdqa %xmm15, %xmm6
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1]
@@ -574,14 +574,14 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSE-NEXT: movdqa %xmm0, %xmm9
; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,0,0]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,0,0]
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,0,0]
; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1]
; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
@@ -590,10 +590,10 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,3]
; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2]
; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3]
; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -604,7 +604,7 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: movapd %xmm7, 32(%rax)
; SSE-NEXT: movaps %xmm6, 48(%rax)
; SSE-NEXT: movaps %xmm5, 16(%rax)
-; SSE-NEXT: movapd %xmm3, (%rax)
+; SSE-NEXT: movapd %xmm2, (%rax)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: store_i16_stride8_vf8:
@@ -849,20 +849,19 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride8_vf16:
; SSE: # %bb.0:
-; SSE-NEXT: subq $72, %rsp
+; SSE-NEXT: subq $88, %rsp
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: movdqa (%rdi), %xmm9
-; SSE-NEXT: movaps 16(%rdi), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rdi), %xmm10
+; SSE-NEXT: movdqa 16(%rdi), %xmm15
; SSE-NEXT: movdqa (%rsi), %xmm0
; SSE-NEXT: movdqa (%rdx), %xmm7
; SSE-NEXT: movdqa (%rcx), %xmm1
; SSE-NEXT: movdqa (%r8), %xmm8
; SSE-NEXT: movdqa (%r9), %xmm2
-; SSE-NEXT: movdqa (%r10), %xmm10
+; SSE-NEXT: movdqa (%r10), %xmm11
; SSE-NEXT: movdqa (%rax), %xmm3
-; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: movdqa %xmm11, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; SSE-NEXT: movdqa %xmm8, %xmm12
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
@@ -873,9 +872,9 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movdqa %xmm7, %xmm6
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1]
-; SSE-NEXT: movdqa %xmm6, %xmm11
+; SSE-NEXT: movdqa %xmm6, %xmm9
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm13
+; SSE-NEXT: movdqa %xmm10, %xmm13
; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
@@ -885,142 +884,145 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,0,0]
; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; SSE-NEXT: movdqa %xmm13, %xmm4
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdx), %xmm14
-; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
+; SSE-NEXT: movdqa 16(%rdx), %xmm9
+; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
; SSE-NEXT: movdqa 16(%rcx), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
; SSE-NEXT: movdqa %xmm8, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm10, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm14, %xmm15
-; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3]
-; SSE-NEXT: movdqa 16(%rsi), %xmm7
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
+; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm9, %xmm5
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE-NEXT: movdqa 16(%rsi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm15, %xmm6
+; SSE-NEXT: movdqa %xmm15, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa 16(%r10), %xmm6
-; SSE-NEXT: movdqa 16(%rax), %xmm10
-; SSE-NEXT: movdqa %xmm6, %xmm5
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; SSE-NEXT: movdqa 16(%r10), %xmm15
+; SSE-NEXT: movdqa 16(%rax), %xmm7
+; SSE-NEXT: movdqa %xmm15, %xmm14
+; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3]
; SSE-NEXT: movdqa 16(%r8), %xmm4
; SSE-NEXT: movdqa 16(%r9), %xmm11
; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3]
; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0]
+; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
-; SSE-NEXT: movdqa %xmm14, %xmm7
-; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1]
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,0,0]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
+; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,0,0]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,2,2,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3]
; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[2,2,2,2]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
-; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm13[0],xmm7[1]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3]
+; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[2,2,2,2]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2]
; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm9[0],xmm13[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm3[0],xmm8[1]
+; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,2,2,2]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
+; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1]
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,3,3,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm15[2],xmm4[3],xmm15[3]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,2,2]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,2,2]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movapd %xmm1, 224(%rax)
; SSE-NEXT: movaps %xmm3, 240(%rax)
-; SSE-NEXT: movapd %xmm8, 160(%rax)
-; SSE-NEXT: movaps %xmm9, 176(%rax)
+; SSE-NEXT: movapd %xmm10, 160(%rax)
+; SSE-NEXT: movaps %xmm8, 176(%rax)
; SSE-NEXT: movapd %xmm13, 96(%rax)
; SSE-NEXT: movaps %xmm12, 112(%rax)
-; SSE-NEXT: movapd %xmm7, 32(%rax)
-; SSE-NEXT: movaps %xmm10, 48(%rax)
-; SSE-NEXT: movapd %xmm14, 192(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 208(%rax)
+; SSE-NEXT: movapd %xmm5, 32(%rax)
+; SSE-NEXT: movaps %xmm6, 48(%rax)
+; SSE-NEXT: movapd %xmm9, 192(%rax)
+; SSE-NEXT: movaps %xmm11, 208(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 128(%rax)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movaps %xmm0, 144(%rax)
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE-NEXT: movaps %xmm0, 144(%rax)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 80(%rax)
@@ -1028,7 +1030,7 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps %xmm0, (%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rax)
-; SSE-NEXT: addq $72, %rsp
+; SSE-NEXT: addq $88, %rsp
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: store_i16_stride8_vf16:
@@ -1036,138 +1038,138 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX1-ONLY-NEXT: subq $136, %rsp
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm4
+; AVX1-ONLY-NEXT:
vmovdqa (%r10), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm14, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = 
xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm8, %ymm10, %ymm8 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3],ymm8[4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm11[3],ymm8[4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = 
xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] @@ -1208,37 +1210,35 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-SLOW-LABEL: store_i16_stride8_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: pushq %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm11 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] @@ 
-1251,26 +1251,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm12 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero @@ -1282,64 +1282,63 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = 
ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = 
ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 224(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1348,7 +1347,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: popq %rax ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1357,123 +1355,123 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: pushq %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] ; 
AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3],ymm2[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 +; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2,3,4],ymm2[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm15 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = 
[2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm14[4],ymm12[5],ymm14[5],ymm12[6],ymm14[6],ymm12[7],ymm14[7],ymm12[12],ymm14[12],ymm12[13],ymm14[13],ymm12[14],ymm14[14],ymm12[15],ymm14[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm14[0],ymm6[0],ymm14[1],ymm6[1],ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[8],ymm6[8],ymm14[9],ymm6[9],ymm14[10],ymm6[10],ymm14[11],ymm6[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
[4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm6[4],ymm14[5],ymm6[5],ymm14[6],ymm6[6],ymm14[7],ymm6[7],ymm14[12],ymm6[12],ymm14[13],ymm6[13],ymm14[14],ymm6[14],ymm14[15],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,1,3,5,7,5,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1488,37 +1486,35 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride8_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: pushq %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] @@ -1531,26 +1527,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), 
%ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero @@ -1562,64 +1558,63 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = 
ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] 
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1628,7 +1623,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: popq %rax ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1636,56 +1630,55 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-NEXT: vmovdqa (%rdx), %ymm9 ; AVX512F-NEXT: vmovdqa (%rcx), %ymm10 ; AVX512F-NEXT: vmovdqa (%r8), %ymm15 ; AVX512F-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-NEXT: vmovdqa (%r10), %ymm6 -; AVX512F-NEXT: vmovdqa (%rax), %ymm0 -; AVX512F-NEXT: 
vmovdqa (%rax), %xmm2 -; AVX512F-NEXT: vmovdqa (%r10), %xmm4 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm16 -; AVX512F-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-NEXT: vmovdqa (%r10), %ymm4 +; AVX512F-NEXT: vmovdqa (%rax), %ymm1 +; AVX512F-NEXT: vmovdqa (%rax), %xmm5 +; AVX512F-NEXT: vmovdqa (%r10), %xmm6 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqa (%r9), %xmm5 ; AVX512F-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm20 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 ; AVX512F-NEXT: vmovdqa (%rcx), %xmm11 ; AVX512F-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm17 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 ; AVX512F-NEXT: vmovdqa (%rsi), %xmm13 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm18 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm19 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm19 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] ; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15] -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] ; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-NEXT: vmovdqa64 %xmm21, %xmm2 ; AVX512F-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 @@ -1695,25 +1688,25 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27> ; AVX512F-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u> -; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 ; AVX512F-NEXT: movb $-86, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] -; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] +; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] ; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm14, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} ; AVX512F-NEXT: vpermt2d %zmm9, %zmm12, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm13, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1784,150 +1777,151 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $264, %rsp # imm = 0x108 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm4 ; SSE-NEXT: movdqa (%rcx), %xmm10 -; SSE-NEXT: movdqa (%r8), %xmm5 +; SSE-NEXT: movdqa (%r8), %xmm6 ; SSE-NEXT: movdqa (%r9), %xmm9 ; SSE-NEXT: movdqa (%r10), %xmm7 ; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%r8), %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; SSE-NEXT: movdqa 16(%r9), %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE-NEXT: movdqa 16(%r9), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movdqa 16(%r10), %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: movdqa 16(%rax), %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} 
xmm9 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,3] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: movdqa 16(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 ; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = 
xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm6[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] +; SSE-NEXT: 
pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r10), %xmm0 ; SSE-NEXT: movdqa 32(%rax), %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 @@ -1937,21 +1931,22 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 32(%rdx), %xmm2 ; SSE-NEXT: movdqa 32(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm10 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, 
%xmm12 @@ -1962,16 +1957,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -1998,54 +1993,54 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] -; SSE-NEXT: movdqa 48(%r10), %xmm7 +; SSE-NEXT: movdqa 48(%r10), %xmm9 ; SSE-NEXT: movdqa 48(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 48(%r8), %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa 48(%r8), %xmm4 ; SSE-NEXT: movdqa 48(%r9), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 48(%rdx), %xmm6 ; SSE-NEXT: movdqa 48(%rcx), %xmm13 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rsi), %xmm12 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; 
SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm7[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] @@ -2053,26 +2048,26 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; 
SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 496(%rax) -; SSE-NEXT: movapd %xmm4, 480(%rax) +; SSE-NEXT: movapd %xmm5, 480(%rax) ; SSE-NEXT: movaps %xmm3, 464(%rax) ; SSE-NEXT: movapd %xmm1, 448(%rax) -; SSE-NEXT: movaps %xmm8, 432(%rax) -; SSE-NEXT: movapd %xmm9, 416(%rax) +; SSE-NEXT: movaps %xmm7, 432(%rax) +; SSE-NEXT: movapd %xmm8, 416(%rax) ; SSE-NEXT: movaps %xmm10, 400(%rax) ; SSE-NEXT: movapd %xmm11, 384(%rax) ; SSE-NEXT: movaps %xmm14, 368(%rax) @@ -2183,21 +2178,21 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 
48(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] @@ -2221,17 +2216,17 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2240,36 +2235,36 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] @@ -2289,105 +2284,105 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm13[0],zero,xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm13[0],zero,xmm13[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3],ymm2[4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: 
vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -2429,8 +2424,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm14, 288(%rax) @@ -2508,27 +2503,27 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] @@ -2547,8 +2542,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero @@ -2613,123 +2608,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd 
{{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-SLOW-NEXT: vpunpcklwd 
{{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm12 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 224(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 192(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2763,73 +2758,74 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 +; 
AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm10 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,2,3,3,3,3,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 @@ -2837,40 +2833,40 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,1,u,1,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm9 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] @@ -2878,137 +2874,138 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm2 -; AVX2-FAST-NEXT: 
vpermd %ymm4, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3],ymm0[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm9 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; 
AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm13[2,3],ymm2[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3091,27 +3088,27 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] @@ -3130,8 +3127,8 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero @@ -3196,123 +3193,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd 
{{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 192(%rax) ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3345,145 +3342,145 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm30 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm26, %zmm30 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm27, %zmm30 {%k1} +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm28, %zmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm28, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm7 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = 
ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm18, %zmm31 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm19, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[8],ymm14[8],ymm1[9],ymm14[9],ymm1[10],ymm14[10],ymm1[11],ymm14[11] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm19, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm20, %zmm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm20, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm21, %zmm15 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm12 -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm19, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm1[4],ymm14[4],ymm1[5],ymm14[5],ymm1[6],ymm14[6],ymm1[7],ymm14[7],ymm1[12],ymm14[12],ymm1[13],ymm14[13],ymm1[14],ymm14[14],ymm1[15],ymm14[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm20, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm21, %zmm17 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm28, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm23 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm26, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm27, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm28, %zmm25 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm25 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm19, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm18, %zmm17 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm16 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm28, %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm22 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm26, %zmm25 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm27, %zmm25 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm28, %zmm24 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm24 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm19, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm19, %zmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm18, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm19, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm19, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm2 {%k1} 
+; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[8],ymm6[8],ymm1[9],ymm6[9],ymm1[10],ymm6[10],ymm1[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11] ; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm21, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm6[4],ymm1[5],ymm6[5],ymm1[6],ymm6[6],ymm1[7],ymm6[7],ymm1[12],ymm6[12],ymm1[13],ymm6[13],ymm1[14],ymm6[14],ymm1[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm20, %zmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm21, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm20, %zmm4 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm21, %zmm4 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm26, %zmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm5 {%k1} ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm28, %zmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm6 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm28, %zmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm7 {%k2} ; AVX512F-SLOW-NEXT: movb $-86, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: 
vmovdqa64 %zmm30, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512F-FAST-NEXT: subq $472, %rsp # imm = 0x1D8 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm1 @@ -3495,9 +3492,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 @@ -3517,42 +3514,41 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, 
%zmm3, %zmm25 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm23 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm22 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm25 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm23 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; 
AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 @@ -3561,124 +3557,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} 
ymm4 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm28 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm31 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm29 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm10 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm30 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm28 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-FAST-NEXT: 
vmovdqa64 %xmm19, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm1, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm31, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm25 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm31, %zmm17 +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm0, %zmm17 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm16 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm22 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm24 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm13 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm1, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm31, %zmm6 +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm31, %zmm7 ; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm1, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] 
-; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm11, %zmm9 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm12 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm11, %zmm12 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm11, %zmm3 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm11, %zmm1 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm5 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm4, %zmm11 -; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm6, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm4, %zmm18 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm6, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm16 -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm6, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm10, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm6, %zmm14 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm6, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm0, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm12, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm0, %zmm22 +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm12, %zmm22 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm12, %zmm3 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm12, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm2, %zmm12 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm5, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm15 +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm5, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm5, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 {%k2} # 64-byte Folded Reload 
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm16 +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm5, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm9, %zmm14 +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm5, %zmm14 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm5, %zmm9 {%k2} ; AVX512F-FAST-NEXT: movb $-86, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm16 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-FAST-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-FAST-NEXT: addq $472, %rsp # imm = 0x1D8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -3825,149 +3820,150 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $776, %rsp # imm = 0x308 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rsi), %xmm6 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm7 +; SSE-NEXT: movdqa (%rdx), %xmm3 ; SSE-NEXT: movdqa (%rcx), %xmm9 ; SSE-NEXT: movdqa (%r8), %xmm4 ; SSE-NEXT: movdqa (%r9), %xmm10 ; SSE-NEXT: movdqa (%r10), %xmm8 ; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} 
xmm14 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r8), %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3] ; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: movdqa 16(%r10), %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movdqa 16(%r10), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa 
%xmm0, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movdqa 16(%rax), %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,0,0] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: 
pshufd {{.*#+}} xmm2 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,3] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm6[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r10), %xmm0 ; SSE-NEXT: movdqa 32(%rax), %xmm4 @@ -3978,21 +3974,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; 
SSE-NEXT: movdqa 32(%rdx), %xmm2 ; SSE-NEXT: movdqa 32(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rsi), %xmm10 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4003,16 +4000,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4050,21 +4047,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 48(%rdx), %xmm2 ; SSE-NEXT: movdqa 48(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa 
%xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rsi), %xmm10 +; SSE-NEXT: movdqa 48(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4075,16 +4073,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4122,21 +4120,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 64(%rdx), %xmm2 ; SSE-NEXT: movdqa 64(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rsi), %xmm10 +; SSE-NEXT: movdqa 64(%rsi), 
%xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4147,16 +4146,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4194,21 +4193,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 80(%rdx), %xmm2 ; SSE-NEXT: movdqa 80(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa 80(%rsi), %xmm10 +; SSE-NEXT: movdqa 80(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4219,16 +4219,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4266,21 +4266,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: movdqa 96(%rdx), %xmm2 ; SSE-NEXT: movdqa 96(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa 96(%rsi), %xmm10 +; SSE-NEXT: movdqa 96(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = 
xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; SSE-NEXT: movdqa %xmm7, %xmm12 @@ -4291,16 +4292,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] @@ -4327,54 +4328,54 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] -; SSE-NEXT: movdqa 112(%r10), %xmm7 +; SSE-NEXT: movdqa 112(%r10), %xmm9 ; SSE-NEXT: movdqa 112(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 112(%r8), %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa 112(%r8), %xmm4 ; SSE-NEXT: movdqa 112(%r9), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa 112(%rdx), %xmm6 ; SSE-NEXT: movdqa 112(%rcx), %xmm13 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; SSE-NEXT: movdqa 112(%rdi), %xmm2 ; SSE-NEXT: movdqa 112(%rsi), %xmm12 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm7[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] @@ -4382,26 +4383,26 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 1008(%rax) -; SSE-NEXT: movapd %xmm4, 992(%rax) +; SSE-NEXT: movapd %xmm5, 992(%rax) ; SSE-NEXT: movaps %xmm3, 976(%rax) ; SSE-NEXT: movapd %xmm1, 960(%rax) -; SSE-NEXT: movaps %xmm8, 944(%rax) -; SSE-NEXT: movapd %xmm9, 928(%rax) +; SSE-NEXT: movaps %xmm7, 944(%rax) +; SSE-NEXT: movapd %xmm8, 928(%rax) ; SSE-NEXT: movaps %xmm10, 912(%rax) ; SSE-NEXT: movapd %xmm11, 896(%rax) ; SSE-NEXT: movaps %xmm14, 880(%rax) @@ -4535,12 +4536,12 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] @@ -4548,36 +4549,36 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6],ymm13[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm8[0],zero,xmm8[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm5[3],ymm13[4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm6[3],ymm13[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 @@ -4589,9 +4590,9 @@ define void @store_i16_stride8_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 @@ -4599,11 +4600,11 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,1,0,1] @@ -4614,7 +4615,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm12 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] @@ -4622,17 +4623,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm5[3],ymm9[4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm5 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -4649,9 +4650,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] @@ -4661,32 +4662,32 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: 
vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] @@ -4706,17 +4707,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3],ymm2[4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] @@ -4743,22 +4744,22 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: 
vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] @@ -4782,13 +4783,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] @@ -5204,14 +5205,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] @@ -5221,20 +5222,20 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero @@ -5244,23 +5245,23 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %xmm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%r10), %xmm8 @@ -5275,13 +5276,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovdqa 
64(%r8), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] @@ -5294,47 +5295,47 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm9[0],zero,xmm9[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] 
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero @@ -5358,16 +5359,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa 96(%r10), %xmm1 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 @@ -5391,13 +5392,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] @@ -5418,9 +5419,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 @@ -5435,8 +5436,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] @@ -5454,227 +5455,227 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; 
AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm1[4],ymm12[5],ymm1[5],ymm12[6],ymm1[6],ymm12[7],ymm1[7],ymm12[12],ymm1[12],ymm12[13],ymm1[13],ymm12[14],ymm1[14],ymm12[15],ymm1[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[12],ymm0[12],ymm11[13],ymm0[13],ymm11[14],ymm0[14],ymm11[15],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[8],ymm5[8],ymm13[9],ymm5[9],ymm13[10],ymm5[10],ymm13[11],ymm5[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[8],ymm4[8],ymm9[9],ymm4[9],ymm9[10],ymm4[10],ymm9[11],ymm4[11] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4],ymm8[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[12],ymm4[12],ymm9[13],ymm4[13],ymm9[14],ymm4[14],ymm9[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = 
ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%r10), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa 64(%r10), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = 
ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3,4],ymm1[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r10), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm10 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r10), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 96(%rax), %ymm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4],ymm15[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 992(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 992(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 960(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 928(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 928(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 896(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 736(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 736(%rax) 
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5738,14 +5739,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: subq $776, %rsp # imm = 0x308 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm4 ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -5755,103 +5756,102 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm12 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3],ymm14[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd 
%ymm1, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqa 64(%rax), %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -5869,19 +5869,21 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,2,2,2,u,u,3,3> ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm4 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm5 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill @@ -5889,23 +5891,23 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,1,1,1,1,u,u> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5916,7 +5918,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm4 ; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm7 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm8 @@ -5925,8 +5927,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm11 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5934,6 +5935,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, 
%ymm2 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] @@ -5963,109 +5965,110 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm15 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm11 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm8[0],ymm15[1],ymm8[1],ymm15[2],ymm8[2],ymm15[3],ymm8[3],ymm15[8],ymm8[8],ymm15[9],ymm8[9],ymm15[10],ymm8[10],ymm15[11],ymm8[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[8],ymm4[8],ymm14[9],ymm4[9],ymm14[10],ymm4[10],ymm14[11],ymm4[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4],ymm5[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3],ymm9[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm15[4],ymm8[4],ymm15[5],ymm8[5],ymm15[6],ymm8[6],ymm15[7],ymm8[7],ymm15[12],ymm8[12],ymm15[13],ymm8[13],ymm15[14],ymm8[14],ymm15[15],ymm8[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm14[4],ymm4[4],ymm14[5],ymm4[5],ymm14[6],ymm4[6],ymm14[7],ymm4[7],ymm14[12],ymm4[12],ymm14[13],ymm4[13],ymm14[14],ymm4[14],ymm14[15],ymm4[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] +; 
AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm15, %ymm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), 
%ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4],ymm1[5],ymm11[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[8],ymm2[8],ymm15[9],ymm2[9],ymm15[10],ymm2[10],ymm15[11],ymm2[11] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm15[4],ymm12[5],ymm15[5],ymm12[6],ymm15[6],ymm12[7],ymm15[7],ymm12[12],ymm15[12],ymm12[13],ymm15[13],ymm12[14],ymm15[14],ymm12[15],ymm15[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[12],ymm2[12],ymm15[13],ymm2[13],ymm15[14],ymm2[14],ymm15[15],ymm2[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,1,3,5,7,5,7] @@ -6073,61 +6076,59 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 
64(%r9), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 64(%r10), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[8],ymm15[8],ymm11[9],ymm15[9],ymm11[10],ymm15[10],ymm11[11],ymm15[11] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; 
AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm15[4],ymm11[5],ymm15[5],ymm11[6],ymm15[6],ymm11[7],ymm15[7],ymm11[12],ymm15[12],ymm11[13],ymm15[13],ymm11[14],ymm15[14],ymm11[15],ymm15[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm15[4],ymm12[5],ymm15[5],ymm12[6],ymm15[6],ymm12[7],ymm15[7],ymm12[12],ymm15[12],ymm12[13],ymm15[13],ymm12[14],ymm15[14],ymm12[15],ymm15[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6144,34 +6145,35 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 96(%r10), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 96(%rax), %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm13, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqa 
96(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm13, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,4,4,4,4,6,5] @@ -6181,17 +6183,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = 
ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] @@ -6307,14 +6309,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] @@ -6324,20 +6326,20 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero @@ -6347,23 +6349,23 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm8 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r10), %xmm8 @@ -6378,13 +6380,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] @@ -6397,47 +6399,47 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm9[0],zero,xmm9[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero @@ -6461,16 +6463,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r10), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm6 @@ -6494,13 +6496,13 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} 
ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] @@ -6521,9 +6523,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm4 @@ -6538,8 +6540,8 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7] @@ -6557,227 +6559,227 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm1[4],ymm12[5],ymm1[5],ymm12[6],ymm1[6],ymm12[7],ymm1[7],ymm12[12],ymm1[12],ymm12[13],ymm1[13],ymm12[14],ymm1[14],ymm12[15],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd 
{{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[12],ymm0[12],ymm11[13],ymm0[13],ymm11[14],ymm0[14],ymm11[15],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[8],ymm5[8],ymm13[9],ymm5[9],ymm13[10],ymm5[10],ymm13[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[8],ymm4[8],ymm9[9],ymm4[9],ymm9[10],ymm4[10],ymm9[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd 
{{.*#+}} ymm10 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4],ymm8[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[12],ymm4[12],ymm9[13],ymm4[13],ymm9[14],ymm4[14],ymm9[15],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r10), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r10), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: 
vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3,4],ymm1[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r10), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm11 = 
ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r10), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4],ymm15[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = 
ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 992(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 992(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 960(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 928(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 928(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 896(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 736(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 736(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6838,661 +6840,651 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-SLOW-LABEL: store_i16_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; 
AVX512F-SLOW-NEXT: subq $504, %rsp # imm = 0x1F8 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm3 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm30, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm29, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; 
AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm19, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm10 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[8],ymm6[8],ymm0[9],ymm6[9],ymm0[10],ymm6[10],ymm0[11],ymm6[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm14, %zmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[8],ymm5[8],ymm11[9],ymm5[9],ymm11[10],ymm5[10],ymm11[11],ymm5[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm15, %zmm4 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm2 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm5[4],ymm11[5],ymm5[5],ymm11[6],ymm5[6],ymm11[7],ymm5[7],ymm11[12],ymm5[12],ymm11[13],ymm5[13],ymm11[14],ymm5[14],ymm11[15],ymm5[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[12],ymm6[12],ymm0[13],ymm6[13],ymm0[14],ymm6[14],ymm0[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm31 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm15, %zmm31 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm16, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm27, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; 
AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm29, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm19, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm16, %zmm31 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm31 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm30, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm29, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm30, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm19, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm18, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm14, %zmm28 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm15, %zmm28 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[12],ymm6[12],ymm12[13],ymm6[13],ymm12[14],ymm6[14],ymm12[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm23 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm15, %zmm23 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm16, %zmm26 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm17, %zmm26 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = 
ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm19, %zmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm5 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm16, %zmm24 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm30, %zmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm1 {%k2} ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm30 -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm30 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm19, %zmm28 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm28 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = 
ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[8],ymm5[8],ymm11[9],ymm5[9],ymm11[10],ymm5[10],ymm11[11],ymm5[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm14, %zmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm16, %zmm23 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm12[0],ymm0[1],ymm12[1],ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[8],ymm12[8],ymm0[9],ymm12[9],ymm0[10],ymm12[10],ymm0[11],ymm12[11] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm15, %zmm22 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[12],ymm6[12],ymm10[13],ymm6[13],ymm10[14],ymm6[14],ymm10[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm12[4],ymm0[5],ymm12[5],ymm0[6],ymm12[6],ymm0[7],ymm12[7],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm5[4],ymm11[5],ymm5[5],ymm11[6],ymm5[6],ymm11[7],ymm5[7],ymm11[12],ymm5[12],ymm11[13],ymm5[13],ymm11[14],ymm5[14],ymm11[15],ymm5[15] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm14, %zmm25 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm15, %zmm25 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm29, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpermd 
%zmm1, %zmm19, %zmm25 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm25 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm16, %zmm21 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm17, %zmm21 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm19, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm18, %zmm27 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm19, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm29 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm14, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm15, %zmm8 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm20 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm16, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm17, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} 
ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] ; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm15, %zmm7 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm16, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm16, %zmm16 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm27, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm10 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm13 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm30, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm19 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm30, %zmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm29, %zmm10 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm30, %zmm17 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm1, %zmm15 -; 
AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm27, %zmm15 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm6 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm30, %zmm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm13, %zmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm6 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm27, %zmm9 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm30, %zmm6 {%k1} ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm1 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm13, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm27, %zmm4 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm2 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm2 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm27, %zmm11 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm30, %zmm4 {%k1} ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm13, %zmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm3 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm5 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm30, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm2 {%k1} ; AVX512F-SLOW-NEXT: 
movb $-86, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm21 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-SLOW-NEXT: 
vmovdqa64 %zmm18, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 576(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 832(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 896(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 896(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-SLOW-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $2504, %rsp # imm = 0x9C8 +; AVX512F-FAST-NEXT: subq $2312, %rsp # imm = 0x908 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm7 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd 
{{.*#+}} ymm0 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; 
AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm7 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; 
AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm8 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[8],ymm8[8],ymm5[9],ymm8[9],ymm5[10],ymm8[10],ymm5[11],ymm8[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm8[4],ymm5[5],ymm8[5],ymm5[6],ymm8[6],ymm5[7],ymm8[7],ymm5[12],ymm8[12],ymm5[13],ymm8[13],ymm5[14],ymm8[14],ymm5[15],ymm8[15] ; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm30 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti32x4 $2, 
%xmm8, %zmm8, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm27 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm21 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 
32(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm28 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm24 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm23 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm26 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm23 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm22 -; 
AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm22 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm16 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm15 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm10 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[12],ymm6[12],ymm3[13],ymm6[13],ymm3[14],ymm6[14],ymm3[15],ymm6[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm11 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm14 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm1, %zmm31 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm1, %zmm28 -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm28 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm0, %zmm25 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm0, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm1, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm3 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm24 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm1, %zmm25 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm5 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm0, %zmm27 +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm1, %zmm27 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm1, %zmm5 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm19 +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm1, %zmm19 {%k2} ; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm18 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm26 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm30 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm12, %zmm27 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm0, %zmm19 -; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm12, %zmm19 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm12, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm12, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm7 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm17 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm17 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm21 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm29 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm29 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm5, %zmm23 -; AVX512F-FAST-NEXT: 
vpermd %zmm24, %zmm12, %zmm23 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm20 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm12, %zmm20 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm12, %zmm5 {%k2} -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm15 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm1, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm1, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm0, %zmm4 +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm1, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm28 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm29 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm23 +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm10, %zmm23 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm0, %zmm20 +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm10, %zmm20 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm10, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm12 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm10 ; 
AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm13, %zmm12 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm13, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm24 -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm13, %zmm24 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm13, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm13, %zmm6 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm13, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm13, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm11, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm22, %zmm14 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte 
Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm26 +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm22, %zmm26 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm26 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm30 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm22, %zmm30 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm31 +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm22, %zmm31 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm22, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm22, %zmm18 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm18 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm10 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm22, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm22 +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm11, %zmm10 {%k1} ; AVX512F-FAST-NEXT: movb $-86, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm30 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 
%zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm26 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 832(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 960(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-FAST-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 832(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 768(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 960(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 896(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-FAST-NEXT: addq $2312, %rsp # imm = 0x908 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -7502,285 +7494,290 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 +; AVX512BW-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; 
AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm17, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm12 +; 
AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm27, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm26, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm27, %zmm28 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, 
%zmm27 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm28 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm25 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm11, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm8, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm29 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm30 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm4, %zmm12 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm11 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm7 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm5 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm3, %zmm19 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm0 {%k1} ; AVX512BW-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2} ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 
{%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm9 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 
%zmm28, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-NEXT: 
vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll index d19eba1cbd51d..cabe23547039b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -436,49 +436,49 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movaps 112(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps 64(%rdi), %xmm10 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps 48(%rdi), %xmm12 ; SSE-NEXT: movaps 96(%rsi), %xmm0 ; SSE-NEXT: movaps 80(%rsi), %xmm1 ; SSE-NEXT: movaps 64(%rsi), %xmm2 ; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm5 -; SSE-NEXT: movaps 48(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movaps 48(%rsi), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -490,62 +490,62 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; SSE-NEXT: movaps 144(%rdi), %xmm10 -; SSE-NEXT: movaps 144(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE-NEXT: movaps 160(%rdi), %xmm12 +; SSE-NEXT: movaps 128(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = 
xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movaps 160(%rdi), %xmm10 ; SSE-NEXT: movaps 160(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: movaps 176(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movaps 176(%rdi), %xmm8 ; SSE-NEXT: movaps 176(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm11 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movaps 208(%rdi), %xmm1 -; SSE-NEXT: movaps 208(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 192(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movaps 208(%rdi), %xmm5 +; SSE-NEXT: movaps 208(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movaps 224(%rdi), %xmm1 ; SSE-NEXT: movaps 224(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps 240(%rsi), %xmm5 +; SSE-NEXT: movaps 240(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, 496(%rdx) ; SSE-NEXT: movaps %xmm0, 480(%rdx) -; SSE-NEXT: movaps %xmm2, 464(%rdx) -; SSE-NEXT: movaps %xmm4, 448(%rdx) -; SSE-NEXT: movaps %xmm1, 432(%rdx) -; SSE-NEXT: movaps %xmm6, 416(%rdx) -; SSE-NEXT: movaps %xmm7, 400(%rdx) -; SSE-NEXT: movaps %xmm8, 384(%rdx) -; SSE-NEXT: movaps %xmm9, 368(%rdx) +; SSE-NEXT: movaps %xmm1, 464(%rdx) +; SSE-NEXT: movaps %xmm2, 448(%rdx) +; SSE-NEXT: movaps %xmm5, 432(%rdx) +; SSE-NEXT: movaps %xmm7, 416(%rdx) +; SSE-NEXT: movaps %xmm6, 400(%rdx) +; SSE-NEXT: movaps %xmm9, 384(%rdx) +; SSE-NEXT: movaps %xmm8, 368(%rdx) ; SSE-NEXT: movaps %xmm11, 352(%rdx) -; SSE-NEXT: movaps %xmm12, 336(%rdx) -; SSE-NEXT: movaps %xmm13, 320(%rdx) -; SSE-NEXT: movaps %xmm10, 304(%rdx) -; SSE-NEXT: 
movaps %xmm15, 288(%rdx) -; SSE-NEXT: movaps %xmm14, 272(%rdx) +; SSE-NEXT: movaps %xmm10, 336(%rdx) +; SSE-NEXT: movaps %xmm12, 320(%rdx) +; SSE-NEXT: movaps %xmm13, 304(%rdx) +; SSE-NEXT: movaps %xmm14, 288(%rdx) +; SSE-NEXT: movaps %xmm15, 272(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index a4482bafbd535..afe43e3c7379d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -191,42 +191,42 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm3 -; SSE-NEXT: movaps 16(%rsi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps (%rsi), %xmm5 +; SSE-NEXT: movaps 16(%rsi), %xmm6 ; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: movaps 16(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; 
SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] ; SSE-NEXT: movaps %xmm9, (%rcx) -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 48(%rcx) -; SSE-NEXT: movaps %xmm5, 64(%rcx) -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps %xmm5, 16(%rcx) +; SSE-NEXT: movaps %xmm4, 48(%rcx) +; SSE-NEXT: movaps %xmm6, 64(%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf8: @@ -374,83 +374,83 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm1 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm5 +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm5 +; SSE-NEXT: movaps (%rsi), %xmm7 ; SSE-NEXT: movaps 16(%rsi), %xmm9 ; SSE-NEXT: movaps 32(%rsi), %xmm10 ; SSE-NEXT: movaps 48(%rsi), %xmm11 ; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps 48(%rdx), %xmm7 -; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: movaps 48(%rdx), %xmm8 +; SSE-NEXT: movaps %xmm5, %xmm12 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movaps %xmm8, %xmm13 -; SSE-NEXT: movaps %xmm8, %xmm6 +; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm11[3,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm11[3,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm7[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm8[0,3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm10[3,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm13[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: unpckhpd 
{{.*#+}} xmm14 = xmm14[1],xmm9[1] ; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3] -; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,0] +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] ; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm14[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,0] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm13, (%rcx) -; SSE-NEXT: movaps %xmm5, 16(%rcx) -; SSE-NEXT: movaps %xmm15, 48(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[2,0] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,3] +; SSE-NEXT: movaps %xmm15, (%rcx) +; SSE-NEXT: movaps %xmm7, 16(%rcx) +; SSE-NEXT: movaps %xmm13, 48(%rcx) ; SSE-NEXT: movaps %xmm9, 64(%rcx) -; SSE-NEXT: movaps %xmm14, 96(%rcx) +; SSE-NEXT: movaps %xmm12, 96(%rcx) ; SSE-NEXT: movaps %xmm10, 112(%rcx) ; SSE-NEXT: movaps %xmm6, 144(%rcx) ; SSE-NEXT: movaps %xmm11, 160(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; SSE-NEXT: movaps %xmm4, 32(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] ; SSE-NEXT: movaps %xmm2, 80(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: movaps %xmm1, 128(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: movaps %xmm8, 176(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: movaps %xmm4, 128(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] +; SSE-NEXT: movaps %xmm5, 176(%rcx) ; SSE-NEXT: retq ; ; 
AVX1-ONLY-LABEL: store_i32_stride3_vf16: @@ -705,9 +705,9 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps (%rsi), %xmm12 -; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps 32(%rsi), %xmm10 +; SSE-NEXT: movaps (%rsi), %xmm10 +; SSE-NEXT: movaps 16(%rsi), %xmm13 +; SSE-NEXT: movaps 32(%rsi), %xmm12 ; SSE-NEXT: movaps 48(%rsi), %xmm9 ; SSE-NEXT: movaps (%rdx), %xmm5 ; SSE-NEXT: movaps 16(%rdx), %xmm6 @@ -715,47 +715,47 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 48(%rdx), %xmm8 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] -; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: movaps %xmm5, %xmm11 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3] ; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm11[3,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm13[3,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm6[1,1] ; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] ; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} 
xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm10[3,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm12[3,3] ; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] @@ -772,62 +772,63 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm9 ; SSE-NEXT: movaps 64(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] -; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3] +; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps 80(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rsi), %xmm8 -; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movaps %xmm10, %xmm11 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm1[0,2] -; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps 96(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,3] -; SSE-NEXT: movaps 96(%rsi), %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} 
xmm6 = xmm6[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm3[0,2] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdx), %xmm10 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2] +; SSE-NEXT: movaps 96(%rdi), %xmm4 +; SSE-NEXT: movaps 96(%rdx), %xmm13 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[0,3] +; SSE-NEXT: movaps 96(%rsi), %xmm5 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2] +; SSE-NEXT: movaps 112(%rdi), %xmm0 +; SSE-NEXT: movaps 112(%rdx), %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm7[0,3] ; SSE-NEXT: movaps 112(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm10[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[2,3] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -840,19 +841,19 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm10[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3] ; 
SSE-NEXT: movaps %xmm1, 352(%rcx) -; SSE-NEXT: movaps %xmm4, 336(%rcx) -; SSE-NEXT: movaps %xmm6, 304(%rcx) -; SSE-NEXT: movaps %xmm7, 288(%rcx) +; SSE-NEXT: movaps %xmm3, 336(%rcx) +; SSE-NEXT: movaps %xmm5, 304(%rcx) +; SSE-NEXT: movaps %xmm6, 288(%rcx) ; SSE-NEXT: movaps %xmm8, 256(%rcx) ; SSE-NEXT: movaps %xmm11, 240(%rcx) ; SSE-NEXT: movaps %xmm12, 208(%rcx) -; SSE-NEXT: movaps %xmm13, 192(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 160(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -869,12 +870,12 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] -; SSE-NEXT: movaps %xmm3, 368(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm0, 320(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 272(%rcx) +; SSE-NEXT: movaps %xmm0, 368(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: movaps %xmm4, 320(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] +; SSE-NEXT: movaps %xmm10, 272(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] ; SSE-NEXT: movaps %xmm9, 224(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -886,8 +887,8 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,3] ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq @@ -1064,25 +1065,25 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = 
ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] @@ -1096,9 +1097,9 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 288(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm15, 288(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm14, 352(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rcx) @@ -1117,7 +1118,7 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i32_stride3_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 @@ -1142,22 +1143,22 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm14[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm6 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6],ymm6[7] ; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4],ymm15[5],ymm6[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm7[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] @@ -1168,10 +1169,10 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6],ymm10[7] ; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] @@ -1197,11 +1198,11 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm3, 288(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm12, 160(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm13, 128(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm5, 256(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) @@ -1269,25 +1270,25 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} 
ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] @@ -1301,9 +1302,9 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 288(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 288(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 352(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) @@ -1371,80 +1372,81 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps (%rsi), %xmm12 +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm4 +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps (%rsi), %xmm0 ; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps 32(%rsi), %xmm10 -; SSE-NEXT: movaps 48(%rsi), %xmm9 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps 
16(%rdx), %xmm6 -; SSE-NEXT: movaps 32(%rdx), %xmm7 -; SSE-NEXT: movaps 48(%rdx), %xmm8 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] +; SSE-NEXT: movaps 32(%rsi), %xmm14 +; SSE-NEXT: movaps 48(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm7 +; SSE-NEXT: movaps 16(%rdx), %xmm8 +; SSE-NEXT: movaps 32(%rdx), %xmm9 +; SSE-NEXT: movaps 48(%rdx), %xmm10 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[0,3] +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm11[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm11[3,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[0,3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: 
unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm10[3,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm14[3,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm9[3,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3] +; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 64(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1458,10 +1460,11 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rsi), %xmm1 
; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1475,10 +1478,11 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 96(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1492,10 +1496,11 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 112(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1509,10 +1514,11 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -1525,231 +1531,238 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm13 -; SSE-NEXT: movaps 144(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3] +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps 144(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm14 +; SSE-NEXT: movaps 160(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm9 -; SSE-NEXT: movaps 160(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 160(%rsi), %xmm15 -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps 176(%rdi), %xmm12 +; SSE-NEXT: movaps 176(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm15[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 176(%rdx), %xmm2 +; SSE-NEXT: movaps 176(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 176(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm13 +; SSE-NEXT: movaps 192(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm11[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps 192(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 192(%rsi), %xmm14 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps 208(%rdi), %xmm6 +; SSE-NEXT: movaps 208(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps 208(%rdi), %xmm8 -; SSE-NEXT: movaps 208(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 208(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: movaps 208(%rsi), %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2] -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps 224(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2] +; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps 224(%rdx), %xmm15 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm15[0,3] +; SSE-NEXT: movaps 224(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2] +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps 240(%rdx), %xmm9 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3] -; SSE-NEXT: movaps 224(%rsi), %xmm5 -; SSE-NEXT: movaps 
%xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[0,3] +; SSE-NEXT: movaps 240(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rdx), %xmm12 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm12[0,3] -; SSE-NEXT: movaps 240(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: shufps 
$233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,2],mem[2,3] -; SSE-NEXT: shufps $233, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm12[2,3] -; SSE-NEXT: movaps %xmm1, 736(%rcx) -; SSE-NEXT: movaps %xmm4, 720(%rcx) -; SSE-NEXT: movaps %xmm5, 688(%rcx) -; SSE-NEXT: movaps %xmm6, 672(%rcx) -; SSE-NEXT: movaps %xmm7, 640(%rcx) -; SSE-NEXT: movaps %xmm10, 624(%rcx) -; SSE-NEXT: movaps %xmm14, 592(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 576(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 544(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 528(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 496(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 480(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 432(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rcx) +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm0, 752(%rcx) +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3] +; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[1,2],mem[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm9[2,3] +; SSE-NEXT: movaps %xmm0, 736(%rcx) +; SSE-NEXT: movaps %xmm3, 720(%rcx) +; SSE-NEXT: movaps %xmm4, 688(%rcx) +; SSE-NEXT: movaps %xmm7, 672(%rcx) +; SSE-NEXT: movaps %xmm8, 640(%rcx) +; SSE-NEXT: movaps %xmm10, 624(%rcx) +; SSE-NEXT: movaps %xmm11, 592(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 576(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 544(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 
528(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 480(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 704(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: movaps %xmm8, 656(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] -; SSE-NEXT: movaps %xmm11, 608(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: movaps %xmm15, 560(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] -; SSE-NEXT: movaps %xmm9, 512(%rcx) +; SSE-NEXT: movaps %xmm2, 752(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] +; SSE-NEXT: movaps %xmm5, 704(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: movaps %xmm6, 656(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3] -; SSE-NEXT: movaps %xmm13, 464(%rcx) -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,3] +; SSE-NEXT: movaps %xmm13, 608(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] +; SSE-NEXT: movaps %xmm12, 560(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0,1,3] +; SSE-NEXT: movaps %xmm14, 512(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: movaps %xmm0, 464(%rcx) +; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 416(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] @@ -1763,7 +1776,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 224(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 176(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2024,63 +2037,62 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i32_stride3_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd 
{{.*#+}} ymm1 = ymm6[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,0,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 64(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] @@ -2092,8 +2104,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2104,25 +2116,26 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; 
AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,0,2,1] +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm9 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[2,1,3,3] +; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm10 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 160(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2143,7 +2156,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 192(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] @@ -2164,51 +2177,50 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 248(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 248(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = 
mem[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] +; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] -; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermpd 
$165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd $165, (%rsp), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] @@ -2224,19 +2236,18 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 608(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 512(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 416(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 320(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 416(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 320(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 640(%rcx) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rcx) @@ -2269,95 +2280,95 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i32_stride3_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $232, %rsp -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm8 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm5 -; 
AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm10[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5,6],ymm4[7] -; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] +; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm1 -; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm0 -; AVX2-FAST-NEXT: 
vpermps %ymm0, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX2-FAST-NEXT: vbroadcastsd 128(%rdx), %ymm3 @@ -2368,74 +2379,74 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm0 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm2[1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd 184(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3],ymm0[4],ymm9[5,6],ymm0[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; 
AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6],ymm1[7] ; AVX2-FAST-NEXT: vbroadcastsd 192(%rdx), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm9 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0],ymm4[1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vbroadcastsd 216(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm10 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] ; AVX2-FAST-NEXT: vbroadcastsd 224(%rdx), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm11 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vbroadcastsd 248(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovaps %ymm7, 736(%rcx) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovaps %ymm6, 
736(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm10, 704(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 672(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, 640(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 608(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 544(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 512(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 480(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 672(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 640(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 608(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 576(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 544(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 512(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 480(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm8, 448(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm15, 416(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm14, 384(%rcx) @@ -2469,63 +2480,62 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-LABEL: store_i32_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 64(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] @@ -2537,8 +2547,8 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2549,25 +2559,26 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 160(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2588,7 +2599,7 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 192(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: 
vmovaps 192(%rsi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] @@ -2609,51 +2620,50 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm8 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd $165, (%rsp), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] @@ -2669,19 +2679,18 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 704(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 608(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 
512(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 416(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 320(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 416(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 320(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 640(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll index 94f9a018fe9e4..bf6991e907177 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -521,14 +521,14 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[1],xmm9[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm15[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -555,9 +555,9 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm13[1],xmm14[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm13[1],xmm15[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 ; 
AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm0 @@ -566,8 +566,8 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm15[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm14[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm7[3,0] @@ -596,8 +596,8 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[2],xmm1[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[3,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm15[3,0],xmm13[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -836,12 +836,13 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; SSE-NEXT: movaps %xmm13, %xmm0 @@ -850,98 +851,97 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rsi), %xmm5 -; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: 
unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps 64(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps 64(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: movaps 80(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps 80(%rdi), %xmm11 ; SSE-NEXT: movaps 80(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdx), %xmm1 -; SSE-NEXT: movaps 96(%rcx), %xmm4 +; SSE-NEXT: movaps 96(%rcx), %xmm6 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 96(%rdi), %xmm3 -; SSE-NEXT: movaps 96(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: movaps %xmm6, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: unpcklps 
{{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 96(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 112(%rdx), %xmm2 -; SSE-NEXT: movaps 112(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movaps 112(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 112(%rsi), %xmm8 +; SSE-NEXT: movaps 112(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, 496(%r8) -; SSE-NEXT: movaps %xmm7, 480(%r8) +; SSE-NEXT: movaps %xmm4, 480(%r8) ; SSE-NEXT: movaps %xmm1, 464(%r8) -; SSE-NEXT: movaps %xmm4, 448(%r8) -; SSE-NEXT: movaps %xmm3, 432(%r8) -; SSE-NEXT: movaps %xmm11, 416(%r8) -; SSE-NEXT: movaps %xmm6, 400(%r8) -; SSE-NEXT: movaps %xmm13, 384(%r8) -; SSE-NEXT: movaps %xmm5, 368(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%r8) -; SSE-NEXT: movaps %xmm15, 336(%r8) +; SSE-NEXT: movaps %xmm3, 448(%r8) +; SSE-NEXT: movaps %xmm5, 432(%r8) +; SSE-NEXT: movaps %xmm10, 416(%r8) +; SSE-NEXT: movaps %xmm9, 400(%r8) +; SSE-NEXT: movaps %xmm12, 384(%r8) +; SSE-NEXT: movaps %xmm11, 368(%r8) +; SSE-NEXT: movaps %xmm15, 352(%r8) +; SSE-NEXT: movaps %xmm8, 336(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%r8) -; SSE-NEXT: movaps %xmm10, 304(%r8) +; SSE-NEXT: movaps %xmm13, 304(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%r8) -; SSE-NEXT: movaps %xmm12, 272(%r8) +; SSE-NEXT: movaps %xmm14, 272(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movaps %xmm14, 208(%r8) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -973,7 +973,7 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $472, %rsp # imm = 0x1D8 +; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1 @@ -999,7 +999,7 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -1015,12 +1015,13 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1031,26 +1032,27 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm13[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1061,41 +1063,39 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm5[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm12[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm13 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm13[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[1],xmm8[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm2[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1112,6 +1112,19 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] @@ -1124,72 +1137,61 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm12[2],xmm1[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = 
zero,zero,xmm13[2],xmm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[3,0],xmm14[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm9[2],xmm10[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[3,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm7[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,0],xmm1[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[3,0],xmm10[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm1[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm6[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[3,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm11[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm6[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[3,0],xmm4[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[3,0],xmm11[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1208,54 +1210,54 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: addq $472, %rsp # 
imm = 0x1D8 +; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vpermpd 
{{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 @@ -1263,85 +1265,85 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: 
vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm14 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3],ymm13[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm14[0],ymm3[0],ymm14[1],ymm3[1],ymm14[4],ymm3[4],ymm14[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[4],ymm15[4],ymm13[5],ymm15[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: 
vunpcklps {{.*#+}} ymm2 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[4],ymm4[4],ymm14[5],ymm4[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[4],ymm15[4],ymm13[5],ymm15[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[4],ymm3[4],ymm13[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[6],ymm0[6],ymm15[7],ymm0[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 448(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 
384(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) @@ -1576,142 +1578,142 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 64(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm5 ; SSE-NEXT: movaps 64(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm0 ; SSE-NEXT: movaps 80(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm5 ; SSE-NEXT: movaps 80(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdx), %xmm0 ; SSE-NEXT: movaps 96(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm5 ; SSE-NEXT: movaps 96(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdx), %xmm0 ; SSE-NEXT: movaps 112(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rdi), %xmm5 ; SSE-NEXT: movaps 112(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdx), %xmm0 ; SSE-NEXT: movaps 128(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm5 ; SSE-NEXT: movaps 128(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = 
xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdx), %xmm0 ; SSE-NEXT: movaps 144(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 144(%rdi), %xmm6 +; SSE-NEXT: movaps 144(%rdi), %xmm5 ; SSE-NEXT: movaps 144(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdx), %xmm0 ; SSE-NEXT: movaps 160(%rcx), %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps 160(%rdi), %xmm5 ; SSE-NEXT: movaps 160(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdx), %xmm0 ; SSE-NEXT: movaps 176(%rcx), %xmm1 ; 
SSE-NEXT: movaps %xmm0, %xmm2 @@ -1733,90 +1735,90 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 192(%rdx), %xmm0 ; SSE-NEXT: movaps 192(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movaps 192(%rdi), %xmm12 -; SSE-NEXT: movaps 192(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] +; SSE-NEXT: movaps 192(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] ; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 208(%rdx), %xmm0 ; SSE-NEXT: movaps 208(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm3 -; SSE-NEXT: movaps 208(%rsi), %xmm9 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps 208(%rdi), %xmm13 +; SSE-NEXT: movaps 208(%rsi), %xmm7 +; SSE-NEXT: movaps %xmm13, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 224(%rdx), %xmm1 -; SSE-NEXT: movaps 224(%rcx), %xmm2 +; SSE-NEXT: movaps 224(%rcx), %xmm6 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 224(%rdi), %xmm7 -; SSE-NEXT: movaps 224(%rsi), %xmm5 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = 
xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: movaps %xmm8, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps 224(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 240(%rdx), %xmm2 -; SSE-NEXT: movaps 240(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; SSE-NEXT: movaps 240(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rsi), %xmm10 +; SSE-NEXT: movaps 240(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, 1008(%r8) -; SSE-NEXT: movaps %xmm5, 992(%r8) +; SSE-NEXT: movaps %xmm4, 992(%r8) ; SSE-NEXT: movaps %xmm1, 976(%r8) -; SSE-NEXT: movaps %xmm4, 960(%r8) -; SSE-NEXT: movaps %xmm7, 944(%r8) -; SSE-NEXT: movaps %xmm11, 928(%r8) -; SSE-NEXT: movaps %xmm8, 912(%r8) -; SSE-NEXT: movaps %xmm14, 896(%r8) -; SSE-NEXT: movaps %xmm3, 880(%r8) +; SSE-NEXT: movaps %xmm3, 960(%r8) +; SSE-NEXT: movaps %xmm5, 944(%r8) +; SSE-NEXT: movaps %xmm10, 928(%r8) +; SSE-NEXT: movaps %xmm9, 912(%r8) +; SSE-NEXT: movaps %xmm11, 896(%r8) +; SSE-NEXT: movaps %xmm13, 880(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%r8) -; SSE-NEXT: movaps %xmm6, 848(%r8) +; SSE-NEXT: movaps %xmm8, 848(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%r8) ; SSE-NEXT: movaps %xmm12, 816(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%r8) -; SSE-NEXT: movaps %xmm13, 784(%r8) 
+; SSE-NEXT: movaps %xmm14, 784(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%r8) ; SSE-NEXT: movaps %xmm15, 752(%r8) @@ -2081,7 +2083,7 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] @@ -2116,25 +2118,25 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm13[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2145,11 +2147,11 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm4[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: 
vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2163,210 +2165,210 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[1],xmm7[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 -; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[1],xmm7[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; 
AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm0[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm12[2],xmm0[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm13[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm10[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[3,0],xmm14[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: 
vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm9[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,0],xmm13[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm4[2],xmm5[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm4[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm0[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[3,0],xmm10[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm3[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm11[3,0],xmm0[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm1[2],xmm6[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm1[2],xmm5[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm10[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm9[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -2374,8 +2376,8 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm2, 928(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 864(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 800(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 736(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 672(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 736(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 672(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2603,49 +2605,49 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm0 
+; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[4],ymm0[4],ymm11[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[6],ymm0[6],ymm11[7],ymm0[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[6],ymm0[6],ymm13[7],ymm0[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] @@ -2684,13 +2686,13 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm1, 864(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 832(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 736(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 704(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 608(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 576(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 704(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 608(%r8) +; AVX2-ONLY-NEXT: vmovaps 
%ymm7, 576(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 17bd3eb320104..4bb7f80a79564 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -684,160 +684,160 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa 32(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm10 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 ; SSE-NEXT: movdqa 32(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm12 -; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps 32(%rcx), %xmm11 +; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps 16(%rcx), %xmm14 +; SSE-NEXT: movaps 32(%rcx), %xmm12 ; SSE-NEXT: movaps (%r8), %xmm3 ; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps 32(%r8), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 32(%r8), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm8[3,3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa 
48(%rdx), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm7 +; SSE-NEXT: movaps 48(%rcx), %xmm9 ; SSE-NEXT: movaps 48(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: 
movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movaps %xmm13, %xmm9 +; SSE-NEXT: movaps %xmm14, %xmm10 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,0] -; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] -; SSE-NEXT: movaps %xmm7, 288(%r9) -; SSE-NEXT: movaps %xmm3, 272(%r9) +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,0] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm9, 288(%r9) +; SSE-NEXT: movaps %xmm4, 272(%r9) ; SSE-NEXT: movdqa %xmm5, 240(%r9) ; SSE-NEXT: movaps %xmm15, 208(%r9) -; SSE-NEXT: movaps %xmm11, 192(%r9) +; SSE-NEXT: movaps %xmm12, 192(%r9) ; SSE-NEXT: movdqa %xmm6, 160(%r9) -; SSE-NEXT: movaps %xmm13, 128(%r9) -; SSE-NEXT: movaps %xmm9, 112(%r9) -; SSE-NEXT: movdqa %xmm12, 80(%r9) +; SSE-NEXT: movaps %xmm14, 128(%r9) +; SSE-NEXT: movaps %xmm10, 112(%r9) +; SSE-NEXT: movdqa %xmm8, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -846,16 +846,16 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) -; SSE-NEXT: movaps %xmm4, 256(%r9) +; SSE-NEXT: movaps %xmm2, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps %xmm2, 176(%r9) +; SSE-NEXT: movaps %xmm3, 176(%r9) 
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) -; SSE-NEXT: movaps %xmm10, 96(%r9) +; SSE-NEXT: movaps %xmm7, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) -; SSE-NEXT: movaps %xmm14, 16(%r9) +; SSE-NEXT: movaps %xmm13, 16(%r9) ; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; @@ -866,33 +866,33 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm10[1],xmm6[1],zero ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm10[0],xmm6[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm8[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm8[1],zero ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm8[0],xmm7[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm9[0],xmm8[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -910,7 +910,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm11[2],xmm10[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1],xmm10[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = 
ymm9[1,1],ymm2[1,1],ymm9[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1],ymm2[1,1],ymm7[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4],ymm10[5,6,7] @@ -931,21 +931,21 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm8[2],xmm7[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm9[2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm9 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3],xmm8[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3],xmm9[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,3],ymm0[3,3],ymm15[7,7],ymm0[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm14[3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm14[2],xmm13[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1],xmm13[1,1] @@ -957,11 +957,11 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm4[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] @@ -969,15 +969,15 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm13[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1118,116 +1118,116 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm4 ; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm3 ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = <0,1,0,1,u,u,2,2> -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm12 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm13 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm14 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm15 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm13[2],xmm8[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm8 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm11, %ymm12 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm11[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm12, %ymm8 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,3,2,3,2,3,2] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm12 +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm11 ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2,3],ymm6[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1,2],ymm14[3,4],ymm1[5,6],ymm14[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm10[1,2,3,4],ymm14[5],ymm10[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2],ymm4[3,4],ymm14[5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2],ymm4[3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: 
vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm11 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm9 -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm1[1,2],ymm13[3,4],ymm1[5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4],ymm13[5],ymm8[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1],ymm13[2],ymm5[3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm12 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm10 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm9, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm5[1,2,3,4],ymm15[5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2,3,4],ymm15[5],ymm6[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm14, 288(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm10, 256(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm6, (%r9) +; AVX2-FAST-NEXT: vmovaps %ymm9, 
160(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm13, 288(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm8, 256(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1509,21 +1509,21 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $728, %rsp # imm = 0x2D8 +; SSE-NEXT: subq $712, %rsp # imm = 0x2C8 ; SSE-NEXT: movdqa (%rsi), %xmm9 ; SSE-NEXT: movdqa 16(%rsi), %xmm7 ; SSE-NEXT: movdqa 32(%rsi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm11 ; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa 32(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm3 ; SSE-NEXT: movaps 16(%rcx), %xmm5 ; SSE-NEXT: movaps 32(%rcx), %xmm6 ; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps 32(%r8), %xmm13 +; SSE-NEXT: movaps 16(%r8), %xmm13 +; SSE-NEXT: movaps 32(%r8), %xmm12 ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] @@ -1532,58 +1532,58 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm12 +; SSE-NEXT: movdqa 48(%rsi), %xmm8 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm8 -; SSE-NEXT: movaps 48(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps 48(%rcx), %xmm2 +; SSE-NEXT: movaps 48(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps 64(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rsi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps 80(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm0 @@ -1595,22 +1595,22 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 96(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps 96(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rsi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm0 -; 
SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rcx), %xmm15 ; SSE-NEXT: movaps 112(%r8), %xmm14 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1618,14 +1618,14 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm11 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] @@ -1633,7 +1633,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1641,106 +1641,107 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm15[1,1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; 
SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm13[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm12[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 64(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 80(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: movaps 80(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = 
xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] @@ -1756,37 +1757,34 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm14, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movaps %xmm14, %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] ; SSE-NEXT: movaps 112(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; 
SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] @@ -1802,7 +1800,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -1815,40 +1813,39 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm0[0],xmm9[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[1,1],mem[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 608(%r9) -; SSE-NEXT: movaps %xmm12, 592(%r9) +; SSE-NEXT: movaps %xmm1, 608(%r9) +; SSE-NEXT: movaps %xmm11, 592(%r9) ; SSE-NEXT: movaps %xmm4, 560(%r9) -; SSE-NEXT: movaps %xmm7, 528(%r9) +; SSE-NEXT: movaps %xmm6, 528(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 512(%r9) ; SSE-NEXT: movaps %xmm8, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%r9) -; SSE-NEXT: movaps %xmm9, 432(%r9) +; SSE-NEXT: movaps %xmm12, 432(%r9) ; SSE-NEXT: movaps %xmm13, 400(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%r9) @@ -1888,10 +1885,10 @@ define void @store_i32_stride5_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm5, 496(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r9) -; SSE-NEXT: movaps %xmm6, 416(%r9) +; SSE-NEXT: movaps %xmm7, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r9) -; SSE-NEXT: movaps %xmm11, 336(%r9) +; SSE-NEXT: movaps %xmm9, 336(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) ; SSE-NEXT: movaps %xmm10, 256(%r9) @@ -1905,136 +1902,135 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: addq $728, %rsp # imm = 0x2D8 +; SSE-NEXT: addq $712, %rsp # imm = 0x2C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm9[1],xmm5[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0],xmm5[0],zero,zero +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm9[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0],xmm6[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm15[0],xmm14[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm11[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0],xmm7[0],zero,zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm4[1],xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm3[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm4[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm8 +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm1[1],xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0],xmm2[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm6[1,2,3],ymm13[4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm14[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm0[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = xmm0[0],xmm1[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2,3],ymm10[4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm12[1],xmm13[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = xmm12[0],xmm13[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2,3],ymm10[4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm9[2],xmm5[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm5[2],xmm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1],ymm13[1,1],ymm5[5,5],ymm13[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3],xmm5[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm9[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm7[2],xmm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm15 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1],ymm15[1,1],ymm7[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3],xmm7[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,3],ymm6[3,3],ymm9[7,7],ymm6[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2,3,4],ymm9[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4],ymm9[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm9[2],ymm5[3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm11[2],xmm7[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm9[2],ymm7[3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm4[2],xmm3[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5],ymm3[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertps 
{{.*#+}} xmm4 = zero,zero,xmm3[2],xmm2[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1],xmm2[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[1,1],ymm4[5,5],ymm3[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm3[3,3] @@ -2042,24 +2038,24 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3],ymm4[3,3],ymm12[7,7],ymm4[7,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,3],ymm4[3,3],ymm14[7,7],ymm4[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4],ymm4[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm2[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm0[2],xmm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2068,110 +2064,109 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: 
vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm11[1,1],ymm9[5,5],ymm11[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm10[1,1],ymm9[5,5],ymm10[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4],ymm0[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm0[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,3],ymm8[3,3],ymm5[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm3[1,2,3,4],ymm11[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm11[2],ymm4[3,4,5,6],ymm11[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = zero,zero,xmm14[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm12[2],xmm13[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, 
%ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4,5],ymm4[6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm5[2],xmm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1],ymm14[1,1],ymm3[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm4[2],xmm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1],xmm3[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm12[1,1],ymm13[5,5],ymm12[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm15[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm4[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,3],ymm2[3,3],ymm15[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm4[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm5[1,2,3,4],ymm0[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm4[3,3],ymm1[3,3],ymm4[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3,4],ymm3[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm3[2],ymm11[3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: 
vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4],ymm11[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3],ymm11[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm11[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm14[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, 544(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm14[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1,2,3],ymm11[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2],mem[3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm12[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 608(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 576(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 608(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9) @@ -2195,7 +2190,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 +; AVX1-ONLY-NEXT: addq $600, %rsp # imm = 0x258 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2310,11 +2305,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -2325,128 +2320,128 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[6],ymm6[6],ymm12[7],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm8 -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] -; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: 
vbroadcastsd 112(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vpermilps $78, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1,2,3],ymm12[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] 
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, 544(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 384(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm15, 608(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 384(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 224(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 64(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 608(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%r9) ; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 448(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%r9) @@ -2478,47 +2473,47 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $584, %rsp # imm = 0x248 +; AVX2-FAST-NEXT: subq $600, %rsp # imm = 0x258 ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm11 ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = <0,1,0,1,u,u,2,2> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm8[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm11[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] -; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm10[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2],xmm11[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm11 @@ -2531,13 +2526,12 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2],xmm11[3] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm14 +; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm13[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%r8), %ymm11 @@ -2545,153 +2539,154 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; AVX2-FAST-NEXT: vunpcklps 
{{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm12, %ymm7 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = [0,1,3,2,3,2,3,2] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm4, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm14 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1,2],ymm7[3,4],ymm10[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm3, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3,4],ymm8[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd 
{{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3,4],ymm8[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm5 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2],ymm11[3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2,3,4],ymm15[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = 
ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3,4],ymm11[5,6],ymm15[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4],ymm11[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2],ymm15[3,4],ymm6[5,6],ymm15[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 120(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3],ymm14[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0,1,2],mem[3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] @@ -2700,32 +2695,32 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; 
AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 544(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm3, 384(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm2, 384(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm12, 64(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm11, 608(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm6, 608(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%r9) @@ -2751,7 +2746,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $584, %rsp # imm = 0x248 +; AVX2-FAST-NEXT: addq $600, %rsp # imm = 0x258 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -2866,11 +2861,11 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -2881,128 +2876,128 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[6],ymm6[6],ymm12[7],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%r8), 
%ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = 
ymm0[0,1,2,3],mem[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $78, (%rsp), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1,2,3],ymm12[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 544(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 384(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 608(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 384(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 608(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 448(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%r9) @@ -3262,54 +3257,55 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1736, %rsp # imm = 0x6C8 -; SSE-NEXT: movdqa (%rsi), %xmm12 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; SSE-NEXT: movdqa (%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rdx), %xmm11 -; SSE-NEXT: movdqa 32(%rdx), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm5 ; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps 32(%rcx), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm7 +; SSE-NEXT: movaps 32(%rcx), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%r8), %xmm4 +; SSE-NEXT: movaps 16(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r8), %xmm14 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm10 +; SSE-NEXT: movdqa 48(%rsi), %xmm12 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 48(%rcx), %xmm6 -; SSE-NEXT: movaps 48(%r8), %xmm13 +; SSE-NEXT: movaps 48(%r8), %xmm7 ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 @@ -3318,10 +3314,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 
-; SSE-NEXT: movaps 64(%rcx), %xmm14 +; SSE-NEXT: movaps 64(%rcx), %xmm13 ; SSE-NEXT: movaps 64(%r8), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3332,10 +3328,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rcx), %xmm14 ; SSE-NEXT: movaps 80(%r8), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3431,7 +3427,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 192(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%r8), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] @@ -3479,47 +3475,47 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movaps 16(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm11[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm11[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3527,13 +3523,13 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -3551,47 +3547,46 @@ define void @store_i32_stride5_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm13[1,1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movaps 80(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm4 @@ -3601,16 +3596,16 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm14, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -3707,13 +3702,13 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 160(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: movaps 160(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] @@ -3721,7 +3716,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3729,43 +3724,43 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: movaps 176(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} 
xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movaps 192(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3773,62 +3768,62 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: movaps %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm9 = 
xmm9[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movaps %xmm4, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps 240(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movaps %xmm3, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] @@ -3848,11 +3843,11 @@ define void 
@store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] @@ -3896,10 +3891,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] @@ -3910,27 +3905,27 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps 
{{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] @@ -3947,17 +3942,17 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 1248(%r9) ; SSE-NEXT: movaps %xmm3, 1232(%r9) ; SSE-NEXT: movaps %xmm6, 1200(%r9) -; SSE-NEXT: movaps %xmm8, 1168(%r9) +; SSE-NEXT: movaps %xmm7, 1168(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1152(%r9) -; SSE-NEXT: movaps %xmm9, 1120(%r9) +; SSE-NEXT: movaps %xmm8, 1120(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1088(%r9) -; SSE-NEXT: movaps %xmm10, 1072(%r9) -; SSE-NEXT: movaps %xmm12, 1040(%r9) +; SSE-NEXT: movaps %xmm9, 1072(%r9) +; SSE-NEXT: movaps %xmm13, 1040(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1008(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 992(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 960(%r9) @@ -4041,22 +4036,22 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm4, 1136(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1104(%r9) -; SSE-NEXT: movaps %xmm7, 1056(%r9) +; SSE-NEXT: movaps %xmm12, 1056(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1024(%r9) -; SSE-NEXT: movaps %xmm11, 976(%r9) +; SSE-NEXT: movaps %xmm10, 976(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 944(%r9) -; SSE-NEXT: movaps %xmm13, 896(%r9) +; SSE-NEXT: movaps %xmm11, 896(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%r9) -; SSE-NEXT: movaps %xmm15, 816(%r9) +; SSE-NEXT: movaps %xmm14, 816(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%r9) ; SSE-NEXT: movaps %xmm2, 736(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 704(%r9) -; SSE-NEXT: movaps %xmm14, 656(%r9) +; SSE-NEXT: movaps %xmm15, 656(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 624(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4079,7 +4074,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) @@ -4095,72 +4090,72 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; 
AVX1-ONLY-NEXT: subq $1784, %rsp # imm = 0x6F8 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm7[1],xmm8[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm7[0],xmm8[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm6[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm14[1],xmm12[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm14[0],xmm12[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm15[1],xmm14[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm15[0],xmm14[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm13[1],xmm10[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm13[0],xmm10[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm10[1],xmm13[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm13[0],zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm5[1],xmm11[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[0],xmm11[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm7[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm9, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 @@ -4173,31 +4168,31 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX1-ONLY-NEXT: 
vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm9[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm9[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm4[1,2,3],ymm6[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm8[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = xmm8[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm12, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm4[1,2,3],ymm9[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm9 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm4[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm4[1],xmm0[1],zero ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3],ymm15[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1,2,3],ymm0[4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4205,15 +4200,15 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm6[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm6[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm9[1],xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4230,110 +4225,108 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm0[1],xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm0[0],xmm1[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = xmm0[0],xmm1[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm7[2],xmm8[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: 
vmovaps 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm14[2],xmm12[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm15[2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm1[1,1],ymm14[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm14[1,1],ymm15[5,5],ymm14[5,5] 
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm13[2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm10[2],xmm13[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4341,31 +4334,32 @@ define void 
@store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm11[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm7[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4387,28 +4381,28 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 
96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm3[3,3],ymm2[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vbroadcastss 132(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm9[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 132(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm8[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4418,11 +4412,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm1 @@ -4430,11 +4424,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm3[3,3],ymm2[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 @@ -4444,7 +4438,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 164(%rcx), %xmm1 @@ -4483,7 +4477,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4491,9 +4485,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 196(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm6[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm9[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4505,8 +4499,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[1,1],ymm11[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 
{{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm1 @@ -4515,12 +4510,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3],ymm8[3,3],ymm9[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] @@ -4543,61 +4538,59 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm5[1,1],ymm4[5,5],ymm5[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm0[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2],ymm6[3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0],ymm3[1,2,3,4],ymm10[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4],ymm7[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm10[2],ymm6[3,4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm7[2],ymm5[3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm15[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm11[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm14[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2],ymm15[3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -4633,30 +4626,30 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 1024(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 1024(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 864(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 704(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%r9) -; 
AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1248(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4671,7 +4664,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9) @@ -4683,7 +4676,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) @@ -4936,18 +4929,18 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm15 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] @@ -4955,162 +4948,160 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = 
ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; 
AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = 
ymm1[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm13 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 
176(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 184(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm8 ; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4 @@ -5122,7 +5113,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5130,104 +5121,103 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 216(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7] -; 
AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 248(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4],ymm11[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3,4],ymm7[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 248(%r8), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = 
ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3],ymm7[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = 
ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -5242,20 +5232,19 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1184(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 1024(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 864(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 864(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 544(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 544(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 384(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1248(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 1216(%r9) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1088(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-SLOW-NEXT: vmovaps %ymm0, 1056(%r9) @@ -5322,302 +5311,297 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i32_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1800, %rsp # imm = 0x708 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,1,0,1,u,u,2,2> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm14 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm15 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm8[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,1,2,1] -; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm12[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm6 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm6 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm12 = xmm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2],xmm12[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = 
xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%r8), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4,5],ymm5[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm12 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovaps 128(%rdx), %xmm0 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm1 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps 128(%r8), %ymm0 +; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm12 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps 160(%r8), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 160(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm12 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps 192(%r8), %ymm0 +; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd 
{{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2],xmm15[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [0,1,3,2,3,2,3,2] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm10, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-FAST-NEXT: 
vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [0,1,3,2,3,2,3,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1,2],ymm11[3,4],ymm15[5,6],ymm11[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm10[1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm15 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} 
xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm4, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1,2],ymm5[3,4],ymm9[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm10 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = 
ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 64(%r8), %ymm3, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3,4],ymm6[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4],ymm3[5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 88(%r8), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 88(%r8), %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5630,35 +5614,35 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: 
vinsertf128 $1, 128(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm13 ; AVX2-FAST-NEXT: vmovaps 128(%rcx), %ymm1 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm13 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5671,32 +5655,34 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 160(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovaps 160(%rcx), %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps 160(%rcx), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovups 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm10 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1,2],ymm1[3,4],ymm11[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5709,30 +5695,30 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 192(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm8 +; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 208(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] @@ -5742,11 +5728,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] @@ -5755,75 +5741,73 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4],ymm11[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = 
ymm11[0],ymm8[1,2],ymm11[3,4],ymm8[5,6],ymm11[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 248(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1,2,3],ymm8[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1,2,3],ymm8[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm4 = ymm9[0],ymm4[1,2],ymm9[3,4],ymm4[5,6],ymm9[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 248(%r8), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm15[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2],ymm12[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1,2,3],ymm8[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = 
ymm8[0,1,2,3],mem[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1],ymm9[2],ymm15[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2,3],ymm4[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] @@ -5835,23 +5819,24 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps $78, (%rsp), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = 
mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vpermilps $78, (%rsp), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] @@ -5861,13 +5846,12 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 1184(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm4, 1024(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm9, 864(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm5, 1024(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm10, 864(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm8, 544(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm11, 384(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm4, 544(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm9, 384(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm12, 224(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6149,18 +6133,18 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] 
; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] @@ -6168,162 +6152,160 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps 
{{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[6],ymm2[6],ymm14[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 184(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4 @@ -6335,7 +6317,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6343,104 +6325,103 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4],ymm11[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3,4],ymm7[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: 
vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2],ymm15[3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3],ymm7[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] 
+; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -6455,20 +6436,19 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1184(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 1024(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 864(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 864(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 704(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 544(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 
384(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 544(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 384(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1248(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1216(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1088(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1056(%r9) @@ -6534,423 +6514,429 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i32_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm23, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm29 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm25 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512F-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm28 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm18, %zmm21 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm26, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm22, %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm18, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm26, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm27, %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm13, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm23, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm20, %zmm3, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512F-NEXT: # 
zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm20, %zmm18, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm26, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm23, %zmm20 -; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm29 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm23, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm16, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm19 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm23, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm12, %zmm15 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm12, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 
+; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 ; AVX512F-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} ; AVX512F-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} ; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512F-NEXT: kmovw 
%eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k3} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm3 {%k3} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, 
%zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm12, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm20, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm15 {%k2} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm13 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm2 {%k3} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm30, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 576(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm30, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm27, 768(%r9) +; AVX512F-NEXT: vmovdqa64 
%zmm20, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 1152(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1216(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm29 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm28 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm5 -; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm18, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm23, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm18, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm23, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm3, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm18, 
%zmm30 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm26, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm23, %zmm20 -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm29 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm23, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm16, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm23, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm12, %zmm15 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm12, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm12, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 +; 
AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k3} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; 
AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm20, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm15 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm2 {%k3} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%r9) +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 576(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 
%zmm22, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1152(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1216(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 4aee13aae36f9..96543b486b35d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -180,36 +180,36 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm4 ; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm5[3,3] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm5[1,1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[2,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps (%r8), %xmm7 +; SSE-NEXT: movaps (%r9), %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3] +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[1,1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[0,2] -; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movaps %xmm6, 32(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,2] +; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm2, 32(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps %xmm7, 64(%rax) -; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) +; SSE-NEXT: movaps %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf4: @@ -335,73 +335,76 @@ define 
void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride6_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm7 +; SSE-NEXT: movaps (%rsi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rsi), %xmm10 -; SSE-NEXT: movaps (%rdx), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm8 ; SSE-NEXT: movaps 16(%rdx), %xmm2 -; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm6 ; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps (%r8), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm5 ; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm8 -; SSE-NEXT: movaps 16(%r9), %xmm4 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; SSE-NEXT: movaps (%r9), %xmm7 +; SSE-NEXT: movaps 16(%r9), %xmm3 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0] ; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,0] -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] ; SSE-NEXT: movaps %xmm2, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm4[3,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm12[0,2] -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm8[3,3] -; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm11[0,2] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm15, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2] +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm7[3,3] +; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm14[0,2] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 
+; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm10[0,2] -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[0,2] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps %xmm14, 48(%rax) ; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm4, 112(%rax) -; SSE-NEXT: movaps %xmm14, 160(%rax) +; SSE-NEXT: movaps %xmm3, 112(%rax) +; SSE-NEXT: movaps %xmm13, 160(%rax) ; SSE-NEXT: movaps %xmm2, 176(%rax) -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movaps %xmm6, 64(%rax) -; SSE-NEXT: movaps %xmm12, 80(%rax) -; SSE-NEXT: movaps %xmm15, 128(%rax) -; SSE-NEXT: movaps %xmm13, 144(%rax) +; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm6, 16(%rax) +; SSE-NEXT: movaps %xmm8, 64(%rax) +; SSE-NEXT: movaps %xmm11, 80(%rax) +; SSE-NEXT: movaps %xmm10, 128(%rax) +; SSE-NEXT: movaps %xmm12, 144(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf8: @@ -771,139 +774,139 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i32_stride6_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $72, %rsp -; SSE-NEXT: movaps (%rdi), %xmm4 -; SSE-NEXT: movaps 16(%rdi), %xmm5 -; SSE-NEXT: movaps (%rsi), %xmm8 -; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps (%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rdx), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps 16(%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm9 +; SSE-NEXT: movaps 16(%rdx), %xmm10 ; SSE-NEXT: movaps (%rcx), %xmm1 -; SSE-NEXT: movaps 16(%rcx), %xmm14 -; SSE-NEXT: movaps (%r8), %xmm9 -; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps (%r9), %xmm2 -; SSE-NEXT: movaps 16(%r9), %xmm0 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: 
movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[2,3] +; SSE-NEXT: movaps 16(%rcx), %xmm0 +; SSE-NEXT: movaps (%r8), %xmm3 +; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps (%r9), %xmm4 +; SSE-NEXT: movaps 16(%r9), %xmm13 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm5[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm4[2,3] -; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm2[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm11[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[0,2] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm13[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm13[3,3] ; SSE-NEXT: movaps 32(%rdx), %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm15[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movaps 32(%r8), %xmm2 -; SSE-NEXT: movaps 32(%r9), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: movaps 32(%r9), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm14[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm15[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm12[2,3] ; 
SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] -; SSE-NEXT: movaps 48(%rdx), %xmm2 -; SSE-NEXT: movaps 48(%rcx), %xmm9 +; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: movaps 48(%rcx), %xmm10 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rsi), %xmm9 ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps 48(%rsi), %xmm10 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; SSE-NEXT: movaps 48(%r8), %xmm3 +; SSE-NEXT: movaps 48(%r8), %xmm1 ; SSE-NEXT: movaps 48(%r9), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 368(%rax) -; SSE-NEXT: movaps %xmm10, 352(%rax) -; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps %xmm4, 320(%rax) +; SSE-NEXT: movaps %xmm3, 368(%rax) +; SSE-NEXT: movaps %xmm9, 352(%rax) +; SSE-NEXT: movaps %xmm2, 336(%rax) +; SSE-NEXT: movaps %xmm5, 320(%rax) ; SSE-NEXT: movaps %xmm6, 304(%rax) -; SSE-NEXT: movaps %xmm5, 288(%rax) +; SSE-NEXT: movaps %xmm4, 288(%rax) ; SSE-NEXT: movaps %xmm13, 272(%rax) ; SSE-NEXT: movaps %xmm8, 256(%rax) ; SSE-NEXT: movaps %xmm12, 240(%rax) -; SSE-NEXT: movaps %xmm15, 224(%rax) +; SSE-NEXT: movaps %xmm14, 224(%rax) ; SSE-NEXT: movaps %xmm11, 208(%rax) -; SSE-NEXT: movaps %xmm14, 192(%rax) +; SSE-NEXT: movaps %xmm15, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -934,7 +937,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i32_stride6_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $104, %rsp -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 @@ -953,15 +956,15 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[4],ymm13[4],ymm4[5],ymm13[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[4],ymm13[4],ymm5[5],ymm13[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] @@ -999,34 +1002,34 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = 
ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] @@ -1073,21 +1076,21 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 256(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps 
%ymm14, 96(%rax) @@ -1104,17 +1107,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-LABEL: store_i32_stride6_vf16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $200, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1125,7 +1128,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm4 @@ -1134,12 +1137,12 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] @@ -1147,11 +1150,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm3 ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; 
AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm2 @@ -1159,12 +1162,12 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] @@ -1174,7 +1177,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm3 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm13, %ymm1 @@ -1188,56 +1191,56 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm11[2],mem[2],xmm11[3],mem[3] -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm12 +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 
= ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] @@ -1251,7 +1254,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] @@ -1262,7 +1265,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 256(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1284,15 +1287,15 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-LABEL: store_i32_stride6_vf16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $232, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 @@ -1311,7 +1314,7 @@ define void 
@store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] @@ -1326,7 +1329,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm2 @@ -1334,14 +1337,14 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] @@ -1351,7 +1354,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm1 ; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm3 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 @@ -1364,34 +1367,34 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = 
ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm3[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7] @@ -1404,17 +1407,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[4],ymm15[4],ymm7[5],ymm15[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] @@ -1423,10 +1426,10 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[4],ymm4[4],ymm14[5],ymm4[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] @@ -1461,17 +1464,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1482,7 +1485,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm4 @@ -1491,12 +1494,12 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] @@ -1504,11 +1507,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm2 @@ -1516,12 +1519,12 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] @@ -1531,7 +1534,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm1 @@ -1545,56 +1548,56 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: 
vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] @@ -1608,7 +1611,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] @@ -1619,7 +1622,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2097,13 +2100,13 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm7 +; SSE-NEXT: movaps 48(%rdx), %xmm6 ; SSE-NEXT: movaps 48(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps 48(%rdi), %xmm7 ; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 48(%r8), %xmm2 ; SSE-NEXT: movaps 48(%r9), %xmm3 @@ -2117,26 +2120,26 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm6 ; SSE-NEXT: movaps 64(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps 64(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 64(%r8), %xmm2 ; SSE-NEXT: movaps 64(%r9), %xmm3 @@ -2150,26 +2153,26 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = 
xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdx), %xmm6 ; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm7 ; SSE-NEXT: movaps 80(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 80(%r8), %xmm2 ; SSE-NEXT: movaps 80(%r9), %xmm3 @@ -2183,84 +2186,84 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm9 ; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm11 ; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movaps %xmm11, %xmm13 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movaps 96(%r8), %xmm2 -; SSE-NEXT: movaps 96(%r9), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm15 +; SSE-NEXT: movaps 96(%r9), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = 
xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] -; SSE-NEXT: movaps 112(%rdx), %xmm2 -; SSE-NEXT: movaps 112(%rcx), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: movaps 112(%rdx), %xmm3 +; SSE-NEXT: movaps 112(%rcx), %xmm12 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 112(%rsi), %xmm10 ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 112(%rsi), %xmm14 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movaps 112(%r8), %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movaps 112(%r8), %xmm1 ; SSE-NEXT: movaps 112(%r9), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 752(%rax) 
-; SSE-NEXT: movaps %xmm14, 736(%rax) -; SSE-NEXT: movaps %xmm0, 720(%rax) -; SSE-NEXT: movaps %xmm4, 704(%rax) +; SSE-NEXT: movaps %xmm3, 752(%rax) +; SSE-NEXT: movaps %xmm10, 736(%rax) +; SSE-NEXT: movaps %xmm2, 720(%rax) +; SSE-NEXT: movaps %xmm5, 704(%rax) ; SSE-NEXT: movaps %xmm6, 688(%rax) -; SSE-NEXT: movaps %xmm5, 672(%rax) -; SSE-NEXT: movaps %xmm11, 656(%rax) -; SSE-NEXT: movaps %xmm9, 640(%rax) -; SSE-NEXT: movaps %xmm10, 624(%rax) -; SSE-NEXT: movaps %xmm12, 608(%rax) +; SSE-NEXT: movaps %xmm4, 672(%rax) +; SSE-NEXT: movaps %xmm9, 656(%rax) +; SSE-NEXT: movaps %xmm8, 640(%rax) +; SSE-NEXT: movaps %xmm11, 624(%rax) +; SSE-NEXT: movaps %xmm14, 608(%rax) ; SSE-NEXT: movaps %xmm15, 592(%rax) ; SSE-NEXT: movaps %xmm13, 576(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2340,13 +2343,14 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1016, %rsp # imm = 0x3F8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm10 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 @@ -2373,10 +2377,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[4],ymm8[4],ymm12[5],ymm8[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2388,9 +2391,9 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -2400,15 +2403,16 @@ define void 
@store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2418,11 +2422,11 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 @@ -2438,16 +2442,15 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 68(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[4],ymm14[4],ymm6[5],ymm14[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm14[0],ymm6[0],ymm14[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2457,16 +2460,16 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 80(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm15[1,2] -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,2],xmm9[1,2] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] @@ -2475,130 +2478,95 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 +; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm15[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm12[1,2],ymm0[5,6],ymm12[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,2],ymm10[1,2],ymm0[5,6],ymm10[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,2],ymm9[1,2],ymm10[5,6],ymm9[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm8[1,2],ymm13[1,2],ymm8[5,6],ymm13[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[6],ymm14[6],ymm6[7],ymm14[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,2],ymm14[1,2],ymm6[5,6],ymm14[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm14[1,2],mem[1,2],ymm14[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,2],ymm13[1,2],ymm12[5,6],ymm13[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[0,0,0,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2],ymm2[1,2],ymm10[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -2608,97 +2576,135 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm10[3,0],ymm9[7,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,0],ymm12[3,0],ymm13[7,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm4 = ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm5[3,0],ymm14[3,0],ymm5[7,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm12 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm13 +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6],ymm10[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 640(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 576(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2721,7 +2727,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $1016, %rsp # imm = 0x3F8 +; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2768,7 +2774,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm12 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm5 @@ -2786,9 +2792,10 @@ define void 
@store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm10 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm8[0],zero,xmm8[1],zero +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -2801,19 +2808,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm14 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm8 ; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm9 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] @@ -2827,174 +2834,175 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,1,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm8 ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm10 +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm11 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm13, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] +; AVX2-SLOW-NEXT: 
vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 64(%rcx), %xmm13 -; AVX2-SLOW-NEXT: vpbroadcastd 64(%rdx), %xmm15 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 64(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vpbroadcastd 64(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm15, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] +; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm12 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd %xmm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; 
AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[6],ymm0[6],ymm10[7],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 116(%r9), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm15 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm10[1],ymm13[2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[4],ymm1[4],ymm14[5],ymm1[5] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] ; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm10 -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] @@ -3004,32 +3012,30 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[4],mem[4],ymm9[5],mem[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 80(%r9), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 80(%r9), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[2,3],ymm4[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] @@ -3039,50 +3045,51 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX2-SLOW-NEXT: 
vpunpckldq {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 112(%r9), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[2,3],ymm6[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[2,2,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 112(%r9), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 736(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 672(%rax) +; AVX2-SLOW-NEXT: 
vmovdqa %ymm0, 736(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 672(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm5, 640(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 544(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 480(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 480(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 448(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 288(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3113,340 +3120,337 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $888, %rsp # imm = 0x378 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: subq $872, %rsp # imm = 0x368 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm7 
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm7 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm8 
-; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3],ymm8[4],ymm0[5],ymm8[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm8 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm7, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm14 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; 
AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm11 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm10 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm8 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm13, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4],ymm1[5],ymm9[6],ymm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm12 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7] ; 
AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpmovzxdq 
{{.*#+}} xmm13 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm11[1],ymm14[2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[2,3],ymm2[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[2,3],ymm5[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm11, %ymm5 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm11 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm14 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[4],mem[4],ymm13[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm11[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte 
Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm11 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[4],ymm12[4],ymm15[5],ymm12[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 
80(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[2,3],ymm6[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[2,3],ymm4[2,3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm2, 736(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 672(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 672(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 640(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 544(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 544(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 480(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm8, 448(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 352(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm13, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 256(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 256(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) @@ -3476,7 +3480,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $888, %rsp # imm = 0x378 +; AVX2-FAST-NEXT: addq $872, %rsp # imm = 0x368 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3523,7 +3527,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: 
vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm5 @@ -3541,9 +3545,10 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -3556,19 +3561,19 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] @@ -3582,174 +3587,175 @@ define 
void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rdx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[6],ymm0[6],ymm10[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 116(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm10[1],ymm13[2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[4],ymm1[4],ymm14[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = 
mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] @@ -3759,32 +3765,30 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[4],mem[4],ymm9[5],mem[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%r9), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 
= ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%r9), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,3],ymm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] @@ -3794,50 +3798,51 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 112(%r9), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 112(%r9), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 736(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 672(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 736(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 672(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 544(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 480(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 448(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3869,135 +3874,135 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512F-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm16, %zmm15 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm18, %zmm17 -; AVX512F-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm13 -; AVX512F-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm16 -; AVX512F-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm18 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm12[2],zmm14[2],zmm12[3],zmm14[3],zmm12[6],zmm14[6],zmm12[7],zmm14[7],zmm12[10],zmm14[10],zmm12[11],zmm14[11],zmm12[14],zmm14[14],zmm12[15],zmm14[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm14, %zmm3, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 +; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 +; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm14 +; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm14 +; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512F-SLOW-NEXT: movb $36, %dl ; AVX512F-SLOW-NEXT: kmovw %edx, %k1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm14, %zmm12 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm22, %zmm12 -; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm20[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm3 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 +; AVX512F-SLOW-NEXT: 
vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512F-SLOW-NEXT: movb $-110, %cl ; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm15 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm15 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512F-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm25, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm26, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm27, %zmm17 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm21, %zmm13 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm23, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm16 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm27, %zmm18 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14 +; 
AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm23, %zmm20 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm21[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm23, %zmm14 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[14],zmm4[14],zmm2[15],zmm4[15] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1 +; 
AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -4005,138 +4010,138 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm10 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm15 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 ; 
AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm19 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm13, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 ; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm18, %zmm16 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm22, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm26 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512F-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm6 -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm17 -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm21 -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm24, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm25 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm12 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm13 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm18 -; 
AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm1 +; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm5 +; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm17 +; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm21 +; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 +; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 +; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 +; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 ; AVX512F-FAST-NEXT: movb $-110, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512F-FAST-NEXT: movb $36, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm9, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm15, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm9, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm15, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm7, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm7, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm20, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 +; 
AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm1 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -4144,135 +4149,135 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm13, %zmm11 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = 
[12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm16, %zmm15 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm18, %zmm17 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm13 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm16 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm18 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm12[2],zmm14[2],zmm12[3],zmm14[3],zmm12[6],zmm14[6],zmm12[7],zmm14[7],zmm12[10],zmm14[10],zmm12[11],zmm14[11],zmm12[14],zmm14[14],zmm12[15],zmm14[15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm14, %zmm3, %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm14 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm14 +; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 ; AVX512BW-SLOW-NEXT: movb $36, %dl ; AVX512BW-SLOW-NEXT: kmovd %edx, %k1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm14, %zmm12 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm22, %zmm12 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 ; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm20[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm3 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2d 
%zmm9, %zmm22, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm23 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512BW-SLOW-NEXT: movb $-110, %cl ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm15 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm15 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512BW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm25, %zmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm26, %zmm17 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm27, %zmm17 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm13 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm21, %zmm13 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm22 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm23, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm16 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm27, %zmm18 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} +; 
AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm20 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm23, %zmm20 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm21[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm14 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm23, %zmm14 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[14],zmm4[14],zmm2[15],zmm4[15] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm4, %zmm0 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm21, %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm4, 
%zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -4280,138 +4285,138 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm10 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 ; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm15 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 +; AVX512BW-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm19 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm13, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 ; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm18, %zmm16 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm22, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm26 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 ; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512BW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm6 -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm17 -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm21 -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm24, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm0 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = 
[4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm25 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm12 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm13 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm18 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm1 +; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm5 +; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm17 +; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm21 +; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 +; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 +; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 +; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 +; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 ; AVX512BW-FAST-NEXT: movb $-110, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm11 ; AVX512BW-FAST-NEXT: movb $36, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm9, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm15, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm9, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm15, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; 
AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm4, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm7, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm7, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm20, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 +; 
AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm11, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm1 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -4535,13 +4540,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm7 +; SSE-NEXT: movaps 48(%rdx), %xmm6 ; SSE-NEXT: movaps 48(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps 48(%rdi), %xmm7 ; SSE-NEXT: movaps 48(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 48(%r8), %xmm2 ; SSE-NEXT: movaps 48(%r9), %xmm3 @@ -4555,26 +4560,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps 
{{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm6 ; SSE-NEXT: movaps 64(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps 64(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 64(%r8), %xmm2 ; SSE-NEXT: movaps 64(%r9), %xmm3 @@ -4588,26 +4593,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdx), %xmm6 ; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm7 ; SSE-NEXT: movaps 80(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 80(%r8), %xmm2 ; SSE-NEXT: movaps 80(%r9), %xmm3 @@ -4621,26 +4626,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; 
SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm6 ; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm7 ; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 96(%r8), %xmm2 ; SSE-NEXT: movaps 96(%r9), %xmm3 @@ -4654,26 +4659,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdx), %xmm7 +; SSE-NEXT: movaps 112(%rdx), %xmm6 ; SSE-NEXT: movaps 112(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; 
SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rdi), %xmm7 ; SSE-NEXT: movaps 112(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 112(%r8), %xmm2 ; SSE-NEXT: movaps 112(%r9), %xmm3 @@ -4687,26 +4692,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdx), %xmm6 ; SSE-NEXT: movaps 128(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm7 ; SSE-NEXT: movaps 128(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 128(%r8), %xmm2 ; SSE-NEXT: movaps 128(%r9), %xmm3 @@ -4720,26 +4725,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] 
-; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdx), %xmm6 ; SSE-NEXT: movaps 144(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 144(%rdi), %xmm6 +; SSE-NEXT: movaps 144(%rdi), %xmm7 ; SSE-NEXT: movaps 144(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 144(%r8), %xmm2 ; SSE-NEXT: movaps 144(%r9), %xmm3 @@ -4753,26 +4758,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdx), %xmm6 ; SSE-NEXT: movaps 160(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps 160(%rdi), %xmm7 ; SSE-NEXT: movaps 160(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 160(%r8), %xmm2 ; SSE-NEXT: movaps 160(%r9), %xmm3 @@ -4786,26 +4791,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = 
xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdx), %xmm6 ; SSE-NEXT: movaps 176(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 176(%rdi), %xmm6 +; SSE-NEXT: movaps 176(%rdi), %xmm7 ; SSE-NEXT: movaps 176(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 176(%r8), %xmm2 ; SSE-NEXT: movaps 176(%r9), %xmm3 @@ -4819,26 +4824,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdx), %xmm6 ; SSE-NEXT: movaps 192(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 192(%rdi), %xmm7 ; SSE-NEXT: movaps 192(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 192(%r8), %xmm2 ; SSE-NEXT: movaps 192(%r9), %xmm3 @@ -4852,26 +4857,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdx), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdx), %xmm6 ; SSE-NEXT: movaps 208(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdi), %xmm6 +; SSE-NEXT: movaps 208(%rdi), %xmm7 ; SSE-NEXT: movaps 208(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 208(%r8), %xmm2 ; SSE-NEXT: movaps 208(%r9), %xmm3 @@ -4885,84 +4890,84 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = 
xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdx), %xmm9 ; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm11 ; SSE-NEXT: movaps 224(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movaps %xmm11, %xmm13 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movaps 224(%r8), %xmm2 -; SSE-NEXT: movaps 224(%r9), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm15 +; SSE-NEXT: movaps 224(%r9), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] -; SSE-NEXT: movaps 240(%rdx), %xmm2 -; SSE-NEXT: movaps 240(%rcx), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: movaps 240(%rdx), %xmm3 +; SSE-NEXT: movaps 240(%rcx), %xmm12 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps 240(%rsi), %xmm10 ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rsi), %xmm14 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movaps 240(%r8), %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movaps 240(%r8), %xmm1 ; SSE-NEXT: movaps 240(%r9), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} 
xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 1520(%rax) -; SSE-NEXT: movaps %xmm14, 1504(%rax) -; SSE-NEXT: movaps %xmm0, 1488(%rax) -; SSE-NEXT: movaps %xmm4, 1472(%rax) +; SSE-NEXT: movaps %xmm3, 1520(%rax) +; SSE-NEXT: movaps %xmm10, 1504(%rax) +; SSE-NEXT: movaps %xmm2, 1488(%rax) +; SSE-NEXT: movaps %xmm5, 1472(%rax) ; SSE-NEXT: movaps %xmm6, 1456(%rax) -; SSE-NEXT: movaps %xmm5, 1440(%rax) -; SSE-NEXT: movaps %xmm11, 1424(%rax) -; SSE-NEXT: movaps %xmm9, 1408(%rax) -; SSE-NEXT: movaps %xmm10, 1392(%rax) -; SSE-NEXT: movaps %xmm12, 1376(%rax) +; SSE-NEXT: movaps %xmm4, 1440(%rax) +; SSE-NEXT: movaps %xmm9, 1424(%rax) +; SSE-NEXT: movaps %xmm8, 1408(%rax) +; SSE-NEXT: movaps %xmm11, 1392(%rax) +; SSE-NEXT: movaps %xmm14, 1376(%rax) ; SSE-NEXT: movaps %xmm15, 1360(%rax) ; SSE-NEXT: movaps %xmm13, 1344(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5138,7 +5143,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2392, %rsp # imm = 0x958 +; AVX1-ONLY-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 @@ -5146,7 +5151,6 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 @@ -5176,6 +5180,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} 
ymm1 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5199,17 +5205,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5219,11 +5225,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 @@ -5239,17 +5245,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 68(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; 
AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5268,8 +5274,9 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5278,17 +5285,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5337,11 +5344,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 144(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm7 @@ -5355,17 +5362,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 164(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5375,11 +5382,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 176(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 @@ -5393,15 +5400,17 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastss 196(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm4[0],ymm10[1],ymm4[1],ymm10[4],ymm4[4],ymm10[5],ymm4[5] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5432,168 +5441,167 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 240(%r9), %ymm15 -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 240(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,2],ymm15[1,2],ymm10[5,6],ymm15[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = 
ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # 
xmm14 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1] ; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm13[1,2],mem[1,2],ymm13[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[6],ymm11[6],ymm0[7],ymm11[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[1,2],mem[1,2],ymm11[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,2],ymm15[1,2],ymm11[5,6],ymm15[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0,0,0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 160(%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = 
ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm8[1,2],ymm6[5,6],ymm8[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,0,0,0] +; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm3[1,2],mem[1,2],ymm3[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,2],ymm2[1,2],ymm4[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 212(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm5[1,2],ymm1[5,6],ymm5[5,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[6],ymm0[6],ymm9[7],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,2],ymm1[1,2],ymm5[5,6],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vbroadcastss 244(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 244(%r9), %ymm1 @@ -5605,8 +5613,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm1 @@ -5622,15 +5630,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[3,0],ymm0[7,4],ymm10[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] @@ -5645,8 +5652,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm1 @@ -5662,8 +5669,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5680,18 +5687,18 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -5699,73 +5706,98 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm15[3,0],mem[3,0],ymm15[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm13[2],mem[2],xmm13[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm12 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] @@ -5773,115 +5805,91 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm12[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3],ymm13[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 
= xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm3[3,0],ymm0[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 224(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm6, 1504(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 1408(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1312(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 1216(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 1120(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 1024(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 1504(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 1344(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 1312(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 1120(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 1024(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 928(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 832(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 640(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 832(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5950,13 +5958,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $2392, %rsp # imm = 0x958 +; AVX1-ONLY-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2488, %rsp # imm = 0x9B8 +; AVX2-SLOW-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 @@ -6152,11 +6160,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero @@ -6186,11 +6194,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6217,11 +6225,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6248,11 +6256,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6279,11 +6287,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6310,11 +6318,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6341,11 +6349,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 192(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -6601,7 +6609,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa 160(%r8), %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 176(%r9), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -6616,8 +6624,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm11 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] @@ -6647,12 +6655,12 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 ; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm14 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] @@ -6672,8 +6680,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -6686,13 +6694,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm14, 1504(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 1440(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 1440(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm13, 1408(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm12, 1312(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 1248(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm11, 1216(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, 1120(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 1056(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 1056(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm9, 1024(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 928(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 864(%rax) @@ -6768,26 +6776,26 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $2488, %rsp # imm = 0x9B8 +; AVX2-SLOW-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2312, %rsp # imm = 0x908 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-NEXT: subq $2376, %rsp # imm = 0x948 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3] @@ -6800,22 +6808,21 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm11 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] @@ -6825,650 +6832,658 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: 
vmovdqa 64(%rsi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm12 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm8 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm9[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm14 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm14[0],zero,xmm14[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm5 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; 
AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm6 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa 192(%r8), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = 
xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa 224(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm5 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm3 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm14 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 32(%rcx), %xmm0 -; AVX2-FAST-NEXT: vbroadcastss 32(%rdx), %xmm5 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 32(%r9), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; 
AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm5 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-NEXT: 
vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 128(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 128(%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 128(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 128(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 128(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 148(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = 
ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 148(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm15[0],mem[0],xmm15[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 160(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 160(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 180(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 180(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 192(%r9), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vpbroadcastd %xmm11, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 192(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 +; 
AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 212(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 212(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 224(%rcx), %xmm0 -; AVX2-FAST-NEXT: vbroadcastss 224(%rdx), %xmm3 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 224(%r9), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 224(%rdx), %xmm4 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd (%rsp), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 224(%r9), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %ymm3 
+; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 244(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %ymm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] +; 
AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[2,3],ymm3[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, 
%ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu 
%ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 244(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = 
ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; 
AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[1],ymm11[1],ymm3[4],ymm11[4],ymm3[5],ymm11[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 
112(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 128(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%r9), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm13 +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r9), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[4],ymm15[4],ymm12[5],ymm15[5] -; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[6],ymm15[6],ymm12[7],ymm15[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[2,3],ymm13[2,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[2,3],ymm15[2,3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqa 224(%r9), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, 
%ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm0[1],ymm3[2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa 224(%r9), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3],ymm12[2,3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[2,3],ymm8[2,3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm7, 1504(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 1504(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 1440(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 1408(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 1312(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 1248(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 1216(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 1120(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 1056(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 1024(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 928(%rax) +; AVX2-FAST-NEXT: 
vmovdqa %ymm10, 1216(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 1120(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 1056(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 1024(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 928(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 864(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7545,13 +7560,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX2-FAST-NEXT: addq $2376, %rsp # imm = 0x948 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2488, %rsp # imm = 0x9B8 +; AVX2-FAST-PERLANE-NEXT: subq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 @@ -7747,11 +7762,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero @@ -7781,11 +7796,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7812,11 +7827,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7843,11 +7858,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7874,11 +7889,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7905,11 +7920,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -7936,11 +7951,11 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero @@ -8196,7 +8211,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%r8), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 176(%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -8211,8 +8226,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] @@ -8242,12 +8257,12 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] @@ -8267,8 +8282,8 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 $19, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -8281,13 +8296,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 1504(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 1440(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 1440(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 1408(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 1312(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 1248(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 1216(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 1120(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 1056(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 1056(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 1024(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 928(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 864(%rax) @@ -8363,1153 +8378,1145 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2488, %rsp # imm = 0x9B8 +; AVX2-FAST-PERLANE-NEXT: addq $2504, %rsp # imm = 0x9C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i32_stride6_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm12 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm21, %zmm25 +; AVX512F-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm13 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 
%zmm3 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm14 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] ; AVX512F-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm22 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm9, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm10, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm9, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm9, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm10, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm28, %zmm27 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm31, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm28, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm15, %zmm31, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm28, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm31, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512F-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 
%zmm17, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm28 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm31, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm31 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [3,11,0,8,7,15,4,12] -; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22 +; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21 +; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12] +; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14 ; AVX512F-SLOW-NEXT: movb $36, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-SLOW-NEXT: vpermt2d %zmm20, %zmm21, %zmm19 -; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm13[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm21, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d 128(%rcx), %ymm0, %ymm31 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm31[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512F-SLOW-NEXT: vpermt2d 192(%rcx), %ymm0, %ymm14 -; 
AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm31 -; AVX512F-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm21 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm14[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm14, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm14, %zmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%r8), %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm14, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm3, %zmm15 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm7 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16 ; AVX512F-SLOW-NEXT: movb $-110, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm2, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm8[2],zmm11[2],zmm8[3],zmm11[3],zmm8[6],zmm11[6],zmm8[7],zmm11[7],zmm8[10],zmm11[10],zmm8[11],zmm11[11],zmm8[14],zmm11[14],zmm8[15],zmm11[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; 
AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm2, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm29 = zmm29[2],zmm4[2],zmm29[3],zmm4[3],zmm29[6],zmm4[6],zmm29[7],zmm4[7],zmm29[10],zmm4[10],zmm29[11],zmm4[11],zmm29[14],zmm4[14],zmm29[15],zmm4[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm2, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm24 = zmm24[2],zmm17[2],zmm24[3],zmm17[3],zmm24[6],zmm17[6],zmm24[7],zmm17[7],zmm24[10],zmm17[10],zmm24[11],zmm17[11],zmm24[14],zmm17[14],zmm24[15],zmm17[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm15 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm3 -; AVX512F-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm8 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm5 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm2 -; AVX512F-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm1 +; 
AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm4 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm17[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm27 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm10 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm17[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm29[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k1} = zmm24[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm10 = 
zmm20[2],zmm31[2],zmm20[3],zmm31[3],zmm20[6],zmm31[6],zmm20[7],zmm31[7],zmm20[10],zmm31[10],zmm20[11],zmm31[11],zmm20[14],zmm31[14],zmm20[15],zmm31[15] -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm10[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%r9), %zmm10 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 192(%r9), %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm25 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm19 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm15 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm6 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm8 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm5 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm22 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm7 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm4 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm27 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm0 -; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm29 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm30 -; AVX512F-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10 +; AVX512F-SLOW-NEXT: 
vpermt2d %zmm2, %zmm1, %zmm8 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22 +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15] +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6 +; 
AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24 +; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 1472(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 1408(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 1152(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 1216(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 896(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-SLOW-NEXT: addq $584, %rsp # imm = 0x248 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 1408(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 1152(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 1024(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; 
AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 1280(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, 1216(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 896(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i32_stride6_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-FAST-NEXT: subq $1160, %rsp # imm = 0x488 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm20 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm31 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm11 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm12, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512F-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 
-; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm27, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm9, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm11, %zmm16 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm12, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm13, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm27, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm13, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm1 -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; 
AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512F-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, 
%zmm11 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm12 +; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm13 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm27, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm27, %zmm14 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm7, %zmm22 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 +; 
AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdx), %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm25 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm27 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm6 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512F-FAST-NEXT: movb $-110, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512F-FAST-NEXT: movb $36, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm28 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 
%zmm21 {%k1}
 ; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm0
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm24
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm19
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm5
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm18
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm13
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm15
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k2}
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 {%k1}
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm27
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm22
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm30
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm19
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm10
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm14
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm17
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm21
+; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2}
+; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1}
 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 {%k2}
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1}
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2}
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1}
 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512F-FAST-NEXT: vmovdqa64 128(%r8), %zmm0
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm2
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm23
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm10
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm4
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm29
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm30
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm17
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2}
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm15
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm3
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm26
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm28
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm13
+; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2}
 ; AVX512F-FAST-NEXT: vmovdqa64 192(%r8), %zmm0
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm8
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm11
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm6
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm2
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm12
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1}
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm2
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 {%k1}
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm27
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2}
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1}
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm29
+; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2}
 ; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm2
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm8
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm16
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm21
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
-; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm25
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm18
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm14
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm6
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm12
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
+; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm18
+; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
+; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm20
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm6
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm14
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm23
 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1}
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm28
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm3
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1}
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
+; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm24
+; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm1
 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm31
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm9
+; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm25
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm5
 ; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm2
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm24
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm19
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm5
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm1
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm15
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm27
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm22
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm30
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm19
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm17
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm21
 ; AVX512F-FAST-NEXT: vmovdqa64 128(%r9), %zmm2
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm23
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm10
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm4
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm29
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm30
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm17
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm16
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm15
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm3
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm26
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm28
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13
 ; AVX512F-FAST-NEXT: vmovdqa64 192(%r9), %zmm2
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm11
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm12
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm27
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm16
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm18
-; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm3
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm9
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm10
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm29
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm12
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm14
+; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm1
 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 1216(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 1152(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 1024(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 960(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 832(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 768(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 576(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 512(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 256(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax)
-; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rax)
-; AVX512F-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 1472(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 1408(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 1344(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 1152(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 1024(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 960(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 896(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 768(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 128(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax)
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, (%rax)
+; AVX512F-FAST-NEXT: addq $1160, %rsp # imm = 0x488
 ; AVX512F-FAST-NEXT: vzeroupper
 ; AVX512F-FAST-NEXT: retq
 ;
 ; AVX512BW-SLOW-LABEL: store_i32_stride6_vf64:
 ; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: subq $584, %rsp # imm = 0x248
-; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24
-; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm11
-; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20
-; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17
-; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3
-; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30
-; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm16
-; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm15
-; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm13
-; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm12
-; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
-; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm25
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm21, %zmm25
+; AVX512BW-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8
+; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9
+; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm30
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10
+; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm8
+; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm4
+; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15
+; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm28
+; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm17
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm24
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm22
+; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18
+; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm16
+; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm13
+; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
+; AVX512BW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm29, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3
 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm14
 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
 ; AVX512BW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm22
-; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
-; AVX512BW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm9, %zmm6
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm10, %zmm6
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm9, %zmm7
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm6
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm18
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm9, %zmm18
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm10, %zmm18
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
-; AVX512BW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm28, %zmm27
-; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
-; AVX512BW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm31, %zmm5
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm20
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm20
+; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
+; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm23
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm23
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm2, %zmm5
 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm28, %zmm23
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm15, %zmm31, %zmm4
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm28, %zmm26
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm31, %zmm30
-; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm18
-; AVX512BW-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm25
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm26
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5
+; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5
+; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm27
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm2, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm9
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm10
-; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vpermi2d %zmm12, %zmm3, %zmm28
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm31, %zmm18
-; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm31
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [3,11,0,8,7,15,4,12]
-; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm0, %ymm14
+; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
+; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm19
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm21, %zmm19
+; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
+; AVX512BW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm24
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm21, %zmm22
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm17
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm21, %zmm18
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm28
+; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm5
+; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm6
+; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm2
+; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm13, %zmm15, %zmm21
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
+; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm0
+; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12]
+; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm2, %ymm14
 ; AVX512BW-SLOW-NEXT: movb $36, %al
 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm14[0,1,0,1,2,3,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm4
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm20, %zmm21, %zmm19
-; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm0, %ymm13
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm13[0,1,0,1,2,3,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm16
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm21, %zmm16
-; AVX512BW-SLOW-NEXT: vpermt2d 128(%rcx), %ymm0, %ymm31
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm31[0,1,0,1,2,3,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14
-; AVX512BW-SLOW-NEXT: vpermt2d 192(%rcx), %ymm0, %ymm14
-; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm20
-; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm31
-; AVX512BW-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm21
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm14[0,1,0,1,2,3,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm12
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm14, %zmm25
-; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm14, %zmm19
-; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm16
-; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm13
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm14, %zmm21
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm15
-; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512BW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm3, %zmm15
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm29, %zmm13
+; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm2, %ymm1
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm29, %zmm14
+; AVX512BW-SLOW-NEXT: vpermt2d 128(%rcx), %ymm2, %ymm0
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm0
+; AVX512BW-SLOW-NEXT: vpermt2d 192(%rcx), %ymm2, %ymm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm12
+; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm7
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm4
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3
+; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm13
+; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm2
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm14
+; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm29
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm16
+; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512BW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm31, %zmm16
 ; AVX512BW-SLOW-NEXT: movb $-110, %al
 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k2
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14
-; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
-; AVX512BW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm2, %zmm14
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22
-; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm8[2],zmm11[2],zmm8[3],zmm11[3],zmm8[6],zmm11[6],zmm8[7],zmm11[7],zmm8[10],zmm11[10],zmm8[11],zmm11[11],zmm8[14],zmm11[14],zmm8[15],zmm11[15]
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k2}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20
+; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
+; AVX512BW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm20
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm20 {%k2}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23
+; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15]
 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
-; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm1, %zmm22
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm11
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm8
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm2, %zmm8
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7
-; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm29 = zmm29[2],zmm4[2],zmm29[3],zmm4[3],zmm29[6],zmm4[6],zmm29[7],zmm4[7],zmm29[10],zmm4[10],zmm29[11],zmm4[11],zmm29[14],zmm4[14],zmm29[15],zmm4[15]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm7
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm6
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm2, %zmm5
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4
-; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm24 = zmm24[2],zmm17[2],zmm24[3],zmm17[3],zmm24[6],zmm17[6],zmm24[7],zmm17[7],zmm24[10],zmm17[10],zmm24[11],zmm17[11],zmm24[14],zmm17[14],zmm24[15],zmm17[15]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm4
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
-; AVX512BW-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm3
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm15
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm11
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm6
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm3
-; AVX512BW-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm2
-; AVX512BW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm14
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm8
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm5
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm2
-; AVX512BW-SLOW-NEXT: vpermi2d %zmm31, %zmm20, %zmm1
+; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
+; AVX512BW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm11, %zmm23
 ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm22
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm7
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm4
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm1
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7]
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm17[2,3,2,3,2,3,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17
-; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7]
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm17[2,3,2,3,2,3,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17
-; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7]
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm17[2,3,2,3,2,3,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm17 = ymm17[2],mem[2],ymm17[3],mem[3],ymm17[6],mem[6],ymm17[7],mem[7]
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm17[2,3,2,3,2,3,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm27
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm23
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm26
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm28
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm10
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm31, %zmm10
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 {%k2}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm25
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 {%k2}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26
+; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm11, %zmm26
 ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm17[6,7,6,7,6,7,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm29[6,7,6,7,6,7,6,7]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm12
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k1} = zmm24[6,7,6,7,6,7,6,7]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm30
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm9
-; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm10 = zmm20[2],zmm31[2],zmm20[3],zmm31[3],zmm20[6],zmm31[6],zmm20[7],zmm31[7],zmm20[10],zmm31[10],zmm20[11],zmm31[11],zmm20[14],zmm31[14],zmm20[15],zmm31[15]
-; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12
-; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm10[6,7,6,7,6,7,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm10
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm18
-; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm13
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm25
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm19
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm16
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm21
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm15
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm11
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm6
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm3
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm14
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm8
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm5
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm2
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm22
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm7
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm4
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm1
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm27
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm23
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm26
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm28
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm0
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm29
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm17, %zmm30
-; AVX512BW-SLOW-NEXT: vpermt2d %zmm13, %zmm17, %zmm18
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm8
+; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k2}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm5, %zmm9
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k2}
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27
+; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm27
+; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31
+; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm31
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5
+; AVX512BW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm5
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11
+; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1}
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm11
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3]
+; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3]
+; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3]
+; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1
+; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3]
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21
+; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm28
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm2
+; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15]
+; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm4
+; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7]
+; AVX512BW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm3
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm15
+; AVX512BW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm1
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
+; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm13
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm14
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm29
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm10
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm8
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm31
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm25
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm9
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm5
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm23
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm26
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm11
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm22
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm18
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm21
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm24
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm17
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm28
+; AVX512BW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm15
 ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 1472(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 1408(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 1152(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 960(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 576(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 384(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, 256(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, (%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, 1280(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, 1216(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, 896(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax)
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax)
-; AVX512BW-SLOW-NEXT: addq $584, %rsp # imm = 0x248
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 1408(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm31, 1152(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 1024(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 768(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, (%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, 1280(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm29, 1216(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 896(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 832(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, 512(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax)
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
+; AVX512BW-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8
 ; AVX512BW-SLOW-NEXT: vzeroupper
 ; AVX512BW-SLOW-NEXT: retq
 ;
 ; AVX512BW-FAST-LABEL: store_i32_stride6_vf64:
 ; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm24
-; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-FAST-NEXT: subq $1160, %rsp # imm = 0x488
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8
+; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm5
 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3
 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm20
-; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25
-; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm14
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm7
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm15
-; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
-; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm31
-; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm31
-; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512BW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm21
-; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm21
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u>
-; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
-; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm28
-; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm28
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
-; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm11
-; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm11
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
-; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1
-; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm12, %zmm1
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
-; AVX512BW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm6
-; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm6
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
-; AVX512BW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm27, %zmm24
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24
+; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm29
+; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm23
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm4
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm21
+; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
+; AVX512BW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm5, %zmm6
+; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm20, %zmm6
 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm9, %zmm6
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6
-; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm11, %zmm16
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm12, %zmm4
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4
-; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm13, %zmm4
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm27, %zmm6
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm4
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm4
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm29
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm29
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm13, %zmm30
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm3
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm1
-; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm5
+; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u>
+; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
+; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17
+; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
+; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm31
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm25
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm31
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
+; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
+; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
+; AVX512BW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm30, %zmm25
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm7, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm10, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm8, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm11, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm12, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm30, %zmm5
 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm9
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm11
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm7, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm8, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26
+; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm11, %zmm26
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28
+; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm28
+; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm30, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm19
+; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm9, %zmm19
+; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm7
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm10
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm8
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm11
 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm12
+; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm12
 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm13
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm27, %zmm2
+; AVX512BW-FAST-NEXT: vpermt2d %zmm23, %zmm30, %zmm2
 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
-; AVX512BW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm14
-; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm27, %zmm14
-; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
-; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12
-; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm12
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm22
-; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm7, %zmm22
+; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
+; AVX512BW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23
+; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm29, %zmm23
+; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
+; AVX512BW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18
+; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7
+; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm8, %zmm7
 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
 ; AVX512BW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm2, %zmm9
-; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15
+; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm2, %zmm5
+; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21
 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm19
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm20
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm19
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm18
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm25
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm18
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm5
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm5
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm13
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm13
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm10
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm10
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm15
-; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm17
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm22
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm24
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm14
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm30
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm10
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm10
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm6
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm6
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm21
+; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13
 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm23
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm23
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm16
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm4
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm4
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm11
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm17
-; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm15
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm12
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm11
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm11
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm13
+; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm1
 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0
-; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm20
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm25
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm27
-; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm6
-; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm7
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm3
+; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm20
+; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm24
+; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm29
+; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm4
+; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm8
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
 ; AVX512BW-FAST-NEXT: movb $-110, %al
 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 {%k2}
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
 ; AVX512BW-FAST-NEXT: movb $36, %al
 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k2}
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm28
-; AVX512BW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm14
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm1
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm31
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm9
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 {%k2}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2}
 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17
+; AVX512BW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2}
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1}
 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2}
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2}
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1}
 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm24
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm19
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm5
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm18
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm13
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm2
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm15
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k2}
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 {%k1}
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm27
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm22
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm30
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm10
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm14
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm17
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm21
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2}
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1}
 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 {%k2}
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1}
 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm2
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm23
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm10
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm4
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm29
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm30
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm17
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2}
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm15
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm3
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm26
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm28
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm13
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2}
 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%r8), %zmm0
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm8
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm6
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9
 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1}
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm2
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1}
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm2
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 {%k1}
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm27
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2}
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1}
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm29
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2}
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm2
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm8
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm16
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
-; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm21
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
-; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm25
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
-; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm18
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
-; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm14
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm6
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm12
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
+; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm18
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
+; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm20
+; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm6
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm31 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm29 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm27 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, 
%zmm1 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 1216(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 1152(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 1024(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, 960(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 832(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512BW-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 1472(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 1408(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 1344(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 1152(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, 1024(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, 960(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 896(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 768(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-FAST-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index 1180f88f3d118..fe216de5231a3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -617,138 +617,131 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $104, %rsp +; SSE-NEXT: subq $24, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm13 -; SSE-NEXT: movdqa 16(%rcx), %xmm9 +; SSE-NEXT: movdqa 16(%rdx), %xmm6 +; SSE-NEXT: movdqa 16(%rcx), %xmm12 ; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm4 -; SSE-NEXT: movaps 16(%r9), %xmm1 -; SSE-NEXT: movdqa (%rax), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movaps 16(%r9), %xmm0 +; SSE-NEXT: movdqa (%rax), %xmm10 +; SSE-NEXT: movaps 16(%rax), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm12[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movaps (%rdx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps (%r8), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm13[0] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm10[3,3] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,1],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[2,0] -; SSE-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[1,3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm8[0] -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[0,1],mem[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm11[2,0] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm1[0],xmm6[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm4[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm1[0],xmm7[1,2,3] +; SSE-NEXT: 
punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm5[0],xmm13[1,2,3] +; SSE-NEXT: movaps (%rcx), %xmm0 +; SSE-NEXT: movaps (%r8), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm6[0] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm14[3,3] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm7[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[2,0] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[1,3] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm15[0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm11[0] +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0,1],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = 
xmm10[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm10[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm14, 112(%rax) -; SSE-NEXT: movdqa %xmm15, 176(%rax) -; SSE-NEXT: movaps %xmm9, (%rax) -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm2, 64(%rax) -; SSE-NEXT: movaps %xmm13, 128(%rax) -; SSE-NEXT: movaps %xmm12, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps %xmm9, 112(%rax) +; SSE-NEXT: movdqa %xmm5, 176(%rax) +; SSE-NEXT: movdqa %xmm8, (%rax) +; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm14, 64(%rax) +; SSE-NEXT: movaps %xmm6, 128(%rax) +; SSE-NEXT: movaps %xmm7, 192(%rax) +; SSE-NEXT: movaps %xmm13, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm7, 96(%rax) +; SSE-NEXT: movaps %xmm4, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm6, 80(%rax) +; SSE-NEXT: movaps %xmm1, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: addq $104, %rsp +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf8: @@ -776,8 +769,8 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[1,1],xmm9[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 @@ -785,19 +778,19 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm14[1],xmm15[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm13[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0] 
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] @@ -807,19 +800,19 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm9[0,2],ymm8[5,5],ymm9[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6],ymm9[7] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rax), %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3],xmm11[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[3,3],xmm11[3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 @@ -855,18 +848,18 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm8 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm6 ; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm7 ; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm1 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm12 ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-SLOW-NEXT: vinsertps 
{{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero @@ -876,61 +869,61 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm14[3,3],xmm15[3,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm10[2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm10[2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm8 ; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm10 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] 
; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm12 ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6],ymm10[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] @@ -946,7 +939,7 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm12, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1067,18 +1060,18 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), 
%ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero @@ -1088,61 +1081,61 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm14[3,3],xmm15[3,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm10[2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm10[2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] @@ -1158,7 +1151,7 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1280,84 +1273,86 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $536, %rsp # imm = 0x218 +; SSE-NEXT: subq $520, %rsp # imm = 0x208 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 ; SSE-NEXT: movaps (%rdx), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm8 ; SSE-NEXT: movaps 16(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm15 -; SSE-NEXT: movaps 16(%r8), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa 16(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm4 +; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm13 +; SSE-NEXT: movdqa 16(%r9), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm11 ; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: 
punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movaps 32(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%r9), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rax), %xmm0 ; SSE-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm1 @@ -1367,178 +1362,176 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps 48(%r8), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps 48(%rax), %xmm7 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r9), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] +; SSE-NEXT: movaps 48(%rcx), %xmm3 +; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[0,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,0] +; SSE-NEXT: movaps 48(%r9), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: movaps 
(%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, %xmm10 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm13[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; 
SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0,1],mem[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm7[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0,1],mem[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm7[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte 
Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm7[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm7[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm6[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3] -; SSE-NEXT: shufps $255, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm6[0],xmm14[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm6[0],xmm13[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm6[0],xmm9[1,2,3] -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 416(%rax) -; SSE-NEXT: movaps %xmm4, 400(%rax) -; SSE-NEXT: movaps %xmm3, 384(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm7[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm7[0],xmm12[1,2,3] +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm9 = 
xmm7[0],xmm9[1,2,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm7[0],xmm8[1,2,3] +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps %xmm1, 416(%rax) +; SSE-NEXT: movaps %xmm3, 400(%rax) +; SSE-NEXT: movaps %xmm4, 384(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movdqa %xmm5, 288(%rax) -; SSE-NEXT: movaps %xmm8, 240(%rax) -; SSE-NEXT: movdqa %xmm11, 224(%rax) +; SSE-NEXT: movaps %xmm6, 240(%rax) +; SSE-NEXT: movdqa %xmm15, 224(%rax) ; SSE-NEXT: movaps %xmm10, 176(%rax) -; SSE-NEXT: movaps %xmm12, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps %xmm11, 128(%rax) +; SSE-NEXT: movaps %xmm13, 112(%rax) +; SSE-NEXT: movaps %xmm14, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1547,31 +1540,32 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rax) -; SSE-NEXT: movaps %xmm9, 320(%rax) -; SSE-NEXT: movaps %xmm13, 304(%rax) +; SSE-NEXT: movaps %xmm8, 320(%rax) +; SSE-NEXT: movaps %xmm9, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) -; SSE-NEXT: movaps %xmm14, 208(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) +; SSE-NEXT: movaps %xmm12, 208(%rax) +; SSE-NEXT: movaps %xmm2, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm2, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $536, %rsp # imm = 0x218 +; SSE-NEXT: addq $520, %rsp # imm = 0x208 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 @@ -1581,12 +1575,11 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm2 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] -; AVX1-ONLY-NEXT: vmovaps 
%ymm4, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm8 +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] @@ -1596,16 +1589,16 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm11[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm4[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 @@ -1616,7 +1609,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1624,21 +1617,23 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[1,1],xmm10[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[1,1],xmm11[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
xmm4 = xmm11[1,1],xmm4[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm12[1,1],xmm4[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm15[1],xmm14[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,1],ymm8[1,1],ymm6[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,1],ymm9[1,1],ymm6[5,5],ymm9[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm7[1,1],ymm13[5,5],ymm7[5,5] -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1],ymm8[1,1],ymm7[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm13 +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] @@ -1648,20 +1643,22 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1],xmm0[0,2] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm14[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1],xmm0[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm2[1],zero -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1],xmm14[1,1] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1669,73 +1666,71 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm5[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,1],ymm10[1,1],ymm8[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm10[1,1],ymm9[5,5],ymm10[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm6[1,1],ymm7[5,5],ymm6[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1],ymm7[1,1],ymm8[5,5],ymm7[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm2[2,1],ymm4[6,4],ymm2[6,5] -; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm15[2,1],ymm2[6,4],ymm15[6,5] +; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,1],ymm3[0,2],ymm1[5,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[0,2],ymm1[5,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm13[3,3],mem[3,3],ymm13[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,3],ymm11[3,3],ymm13[7,7],ymm11[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm13[3,3],mem[3,3],ymm13[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3],ymm6[3,3],ymm11[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,3],ymm8[3,3],ymm10[7,7],ymm8[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm1[3,3],ymm2[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3],ymm1[1,2],ymm4[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vextractf128 
$1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3],ymm1[3,3],ymm15[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3],ymm1[1,2],ymm5[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -1743,42 +1738,43 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3],xmm11[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm14[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm12[2],mem[2],xmm12[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[4],mem[4],ymm13[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,1],ymm3[0,2],ymm13[7,5],ymm3[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm5[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3],xmm6[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,1],ymm4[0,2],ymm6[7,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm5[0,2],ymm7[7,5],ymm5[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm14[3,3],xmm15[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm12[3,3],xmm14[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] @@ -1786,12 +1782,10 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1808,104 +1802,104 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; 
AVX2-SLOW-LABEL: store_i32_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX2-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm10 -; AVX2-SLOW-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm6 +; AVX2-SLOW-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm13 -; AVX2-SLOW-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm8 +; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm2 ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm14 -; AVX2-SLOW-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm9 +; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm11[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm5 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[1],xmm1[1],zero -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm14 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm6 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm6 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 60(%r8), %ymm1 @@ -1915,9 +1909,9 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -1925,125 +1919,125 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm3 -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm7, %ymm2 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastsd %xmm15, %ymm3 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm12[3,3] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovaps %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm11, %ymm2 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastsd %xmm12, %ymm5 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 
-; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] -; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] +; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm13 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = xmm1[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm0[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm10[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm0[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 352(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2060,7 +2054,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -2068,52 +2062,52 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $536, %rsp # imm = 0x218 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps (%rax), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rax), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm7 -; AVX2-FAST-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %xmm8 -; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm5 +; AVX2-FAST-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %xmm7 +; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FAST-NEXT: vmovaps %xmm4, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm6[1],xmm9[1],zero +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm10[1],xmm11[1],zero ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm12 ; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm2[1],xmm11[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm8[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 @@ -2135,20 +2129,20 @@ define void @store_i32_stride7_vf16(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm12 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[6],ymm11[6],ymm2[7],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm4 ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm15 = [5,6,5,6,5,6,5,6] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm15, %ymm15 @@ -2156,7 +2150,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2168,132 +2162,133 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm6[3,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm7, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm0 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = 
xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm1 -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm6 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm15, %ymm5 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss %xmm11, %xmm2 +; AVX2-FAST-NEXT: vbroadcastss %xmm10, %xmm10 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastsd %xmm14, %ymm9 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm11[1,1],ymm2[5,5],ymm11[5,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] ; 
AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm6[3,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm3 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm4 +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm4 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm10, %ymm4 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastsd 
%xmm6, %ymm4 ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm1[1,1],ymm3[5,5],ymm1[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm4[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1],ymm1[1,1],ymm7[5,5],ymm1[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2],ymm9[3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[6],ymm0[6],ymm5[7],ymm0[7] ; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovaps %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[3,3],ymm7[3,3],ymm2[7,7],ymm7[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm8[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm14[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[3,3],ymm10[3,3],ymm2[7,7],ymm10[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm2[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm15[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) ; AVX2-FAST-NEXT: 
vmovaps %ymm4, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2320,98 +2315,98 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm11[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[1],xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r8), %ymm1 @@ -2421,9 +2416,9 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -2431,125 +2426,125 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; 
AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm12[3,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; 
AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = 
ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm1[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm0[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = 
ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm10[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm0[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2566,7 +2561,7 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX2-FAST-PERLANE-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2824,71 +2819,70 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE: # %bb.0: ; SSE-NEXT: subq $1256, %rsp # imm = 0x4E8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm10 -; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm15 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm10 
+; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movaps (%rdx), %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm13 +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%r8), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movdqa 16(%r9), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdx), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r9), %xmm1 @@ -2898,23 +2892,22 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm8 +; SSE-NEXT: movdqa 48(%rsi), %xmm3 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm13 +; SSE-NEXT: movaps 48(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%r9), %xmm1 @@ -2924,24 +2917,26 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm1 -; SSE-NEXT: movaps 64(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%r9), %xmm1 @@ -2951,17 +2946,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm12 +; SSE-NEXT: movdqa 80(%rsi), %xmm3 ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2979,37 +2974,38 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm4 -; SSE-NEXT: movaps 96(%rdx), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movdqa 96(%rsi), %xmm6 +; SSE-NEXT: movaps 96(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 96(%rcx), %xmm3 -; SSE-NEXT: movaps 96(%r8), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r8), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rax), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 96(%rax), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm12, %xmm0 @@ -3021,225 +3017,227 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movaps 112(%rcx), %xmm2 ; SSE-NEXT: movaps 112(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: movaps 112(%r9), %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm3[3,3,3,3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 112(%rax), %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] +; SSE-NEXT: movaps 112(%rax), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[0,2] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; 
SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,1],mem[0,3] +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,1],mem[0,3] +; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps 
{{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = 
xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = 
xmm13[0,1],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm9 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm14, %xmm10 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps 112(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[0,3] -; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; 
SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[3,3],mem[3,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm12[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] @@ -3247,12 +3245,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = 
xmm1[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -3289,11 +3286,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] @@ -3304,11 +3302,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] @@ -3333,19 +3331,19 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 864(%rax) -; SSE-NEXT: movaps %xmm5, 848(%rax) +; SSE-NEXT: movaps %xmm5, 864(%rax) +; SSE-NEXT: movaps %xmm7, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: movaps %xmm4, 784(%rax) -; SSE-NEXT: movaps %xmm7, 736(%rax) -; SSE-NEXT: movaps %xmm8, 688(%rax) -; SSE-NEXT: movaps %xmm9, 672(%rax) -; SSE-NEXT: movaps %xmm10, 624(%rax) -; SSE-NEXT: movaps %xmm11, 576(%rax) -; SSE-NEXT: movaps %xmm13, 560(%rax) +; SSE-NEXT: movaps %xmm6, 784(%rax) +; SSE-NEXT: movaps %xmm8, 736(%rax) +; SSE-NEXT: movaps %xmm9, 688(%rax) +; SSE-NEXT: movaps %xmm10, 672(%rax) +; SSE-NEXT: movaps %xmm11, 624(%rax) +; SSE-NEXT: movaps %xmm13, 576(%rax) +; SSE-NEXT: movaps %xmm15, 560(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 512(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3393,13 +3391,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 608(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: 
movaps %xmm0, 592(%rax) -; SSE-NEXT: movaps %xmm6, 544(%rax) +; SSE-NEXT: movaps %xmm3, 544(%rax) ; SSE-NEXT: movaps %xmm12, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%rax) -; SSE-NEXT: movaps %xmm15, 432(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3416,8 +3415,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) +; SSE-NEXT: movaps %xmm4, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3435,27 +3433,27 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1656, %rsp # imm = 0x678 +; AVX1-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 @@ -3472,11 +3470,11 @@ define void 
@store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -3491,7 +3489,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3508,10 +3506,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3521,18 +3518,18 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 
+; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -3551,11 +3548,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3565,11 +3562,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[1,1],ymm11[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3585,25 +3582,26 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: 
vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3611,34 +3609,34 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm9[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm13[1,1],ymm14[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: 
vmovaps 64(%rdx), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm9[1,1],ymm2[5,5],ymm9[5,5] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,1],ymm1[6,4],ymm10[6,5] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[2,1],ymm1[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -3646,93 +3644,96 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm7[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm6[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} 
xmm15 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm11[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm3[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm3[1],xmm4[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm1[0,2],ymm8[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[0,2],ymm11[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm14 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[0,2],ymm12[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[0,2],ymm10[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3],ymm7[3,3],ymm14[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm2[3,3],ymm6[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm15[3,3],ymm8[7,7],ymm15[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 124(%r8), %ymm1 @@ -3742,9 +3743,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,1],ymm0[0,2],ymm14[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[4],ymm15[4],ymm8[5],ymm15[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 108(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -3753,8 +3757,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm14[1,1],ymm7[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm6[1,1],ymm2[5,5],ymm6[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm2[1,1],ymm3[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm8[1,1],ymm15[5,5],ymm8[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%r8), %ymm1 @@ -3767,10 +3771,10 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3781,14 +3785,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm4[3,3],ymm3[7,7],ymm4[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm12[3,3],ymm4[7,7],ymm12[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm5[3,3],ymm11[7,7],ymm5[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3796,138 +3798,139 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6],ymm13[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm13[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[3,3],ymm14[3,3],ymm5[7,7],ymm14[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,3],ymm7[3,3],ymm12[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,3],ymm11[3,3],ymm0[7,7],ymm11[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[2,3],ymm11[1,2],ymm0[6,7],ymm11[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm13[1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3],ymm12[3,3],ymm11[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm6[3,3],ymm10[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm6[3,3],ymm8[3,3],ymm6[7,7],ymm8[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,3],ymm14[3,3],ymm6[7,7],ymm14[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm14[3,3],ymm4[7,7],ymm14[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,3],ymm0[3,3],ymm1[7,7],ymm0[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3],ymm4[1,2],ymm2[6,7],ymm4[5,6] -; AVX1-ONLY-NEXT: vperm2f128 
{{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[3,1],ymm5[0,2],ymm9[7,5],ymm5[4,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = 
ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,1],ymm3[0,2],ymm11[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm13[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[4],ymm14[4],ymm4[5],ymm14[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1],ymm5[0,2],ymm6[7,5],ymm5[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm9[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[4],ymm7[4],ymm12[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1],ymm7[0,2],ymm9[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1],ymm9[0,2],ymm6[7,5],ymm9[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[3,3],xmm13[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm6, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 640(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 512(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) @@ -3963,7 +3966,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rax) -; AVX1-ONLY-NEXT: addq $1656, %rsp # imm = 0x678 +; AVX1-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3977,42 +3980,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm14 -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm6 -; AVX2-SLOW-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm10 ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm9 ; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm12[1],zero +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm7[1],xmm10[1],zero -; 
AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm13 ; AVX2-SLOW-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm13[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm11[1],zero +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4034,16 +4037,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm13 -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm13[1],zero +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm13[1],xmm2[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm11 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rax), %xmm1 @@ -4058,7 +4062,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero @@ -4088,11 +4092,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4139,10 +4143,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm9 -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] @@ -4151,146 +4154,146 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastss 112(%rax), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm15 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps %xmm14, %xmm9 +; AVX2-SLOW-NEXT: vmovaps 
%xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm14, %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm6, %xmm4 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, 
%ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm11[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm4 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm10[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 108(%r8), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = mem[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 108(%r8), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = mem[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm6[1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rax), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3,4],ymm0[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm7[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] @@ -4306,12 +4309,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} ymm2 = ymm10[1,1],ymm5[1,1],ymm10[5,5],ymm5[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] @@ -4328,124 +4331,125 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm2[3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 80(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm4[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm3[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm2[3,3],mem[3,3] 
-; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0],ymm9[1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm3[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4],ymm11[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = 
ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1,2],ymm12[3,4],ymm7[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm2[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = xmm5[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3,4],ymm13[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm12, 
640(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 544(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 416(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 608(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 576(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 544(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 608(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 384(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 160(%rax) @@ -4489,15 +4493,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-FAST-LABEL: store_i32_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1400, %rsp # imm = 0x578 +; AVX2-FAST-NEXT: subq $1416, %rsp # imm = 0x588 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm1 @@ -4505,30 +4509,29 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm5 ; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm6[1],zero +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = 
zero,xmm1[1],xmm2[1],zero ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm11 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 @@ -4538,8 +4541,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm2[1],xmm3[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm6[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r8), %xmm1 @@ -4560,20 +4563,20 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm3[1],xmm2[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r9), %xmm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX2-FAST-NEXT: vmovaps %xmm2, 
%xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps %xmm2, %xmm7 +; AVX2-FAST-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 96(%rax), %xmm1 @@ -4587,35 +4590,37 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm3[1],xmm2[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm10 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm8 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4,5],ymm6[6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm10 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[6],ymm10[6],ymm6[7],ymm10[7] +; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 @@ -4637,8 +4642,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 @@ -4647,11 +4652,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 64(%r8), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4660,51 +4665,51 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] -; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6],ymm15[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm5[1,1],ymm1[1,1],ymm5[5,5],ymm1[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm15[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0],ymm3[1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vbroadcastss 112(%r9), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 112(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm15[2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm14 +; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm3 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6],ymm15[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vbroadcastss 108(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermps 96(%r9), %ymm11, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: 
vpermps 96(%r9), %ymm12, %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1,2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vmovaps 96(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -4715,44 +4720,45 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm14, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovaps %xmm11, %xmm15 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vbroadcastsd (%rsp), %ymm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm15[3,3],xmm13[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[3,3],xmm15[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps %xmm5, %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm10[1,1],ymm6[5,5],ymm10[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -4767,11 +4773,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] @@ -4780,192 +4786,194 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm5[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1],ymm11[1,1],ymm3[5,5],ymm11[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm1[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[1,1],mem[1,1],ymm0[5,5],mem[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm3 -; AVX2-FAST-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm4 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3],xmm0[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm1[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm0[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 80(%rax), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm10[1,1],ymm13[5,5],ymm10[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 80(%rax), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm6 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm8 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm3 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm0[3,3],xmm2[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm9 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6],ymm0[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm6[2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm1[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm3[2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6],ymm9[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[4],ymm1[4],ymm10[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, (%rsp), 
%xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0],ymm9[1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[6],ymm10[6],ymm1[7],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6],ymm10[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3],xmm9[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[6],ymm1[6],ymm7[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm10 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm10[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[6],ymm12[6],ymm0[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4],ymm4[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4],ymm11[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm11 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm5[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # 
ymm7 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3,4],ymm13[5,6],ymm14[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm0[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm12, 640(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm11, 544(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 416(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm7, 320(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 736(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm8, 672(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 640(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm8, 
544(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 736(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 608(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5006,7 +5014,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 800(%rax) -; AVX2-FAST-NEXT: addq $1400, %rsp # imm = 0x578 +; AVX2-FAST-NEXT: addq $1416, %rsp # imm = 0x588 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -5020,42 +5028,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm12[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm7[1],xmm10[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = 
xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm13[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5077,16 +5085,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm13[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm13[1],xmm2[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %xmm1 @@ -5101,7 +5110,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: 
vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero @@ -5131,11 +5140,11 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5182,10 +5191,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] @@ -5194,146 +5202,146 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rax), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 
%xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = 
xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm11[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 
# 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3],xmm10[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 108(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = mem[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 108(%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = mem[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm8 = ymm8[0],ymm6[1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3,4],ymm0[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] @@ -5349,12 +5357,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm5[1,1],ymm10[5,5],ymm5[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] @@ -5371,124 +5379,125 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm2[3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm4[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm3[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm2[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0],ymm9[1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] -; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,3],ymm13[3,3],ymm11[7,7],ymm13[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3,4],ymm7[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm3[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[6],ymm12[6],ymm14[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4],ymm11[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = 
ymm12[0],ymm7[1,2],ymm12[3,4],ymm7[5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm2[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm5[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3,4],ymm13[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 640(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 544(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 416(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 608(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 576(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 544(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 608(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 576(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 160(%rax) @@ -5534,200 +5543,201 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F: # %bb.0: ; AVX512F-NEXT: pushq %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm10 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm28 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm30, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm23 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm13 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm22 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: 
vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512F-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm27, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm6 {%k2} -; AVX512F-NEXT: 
vmovdqa64 %zmm28, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm27, %zmm28 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm28 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm18 {%k2} -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm30 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512F-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm25 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm18 {%k2} +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512F-NEXT: vpermi2d %zmm9, %zmm2, %zmm28 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512F-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512F-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm16 {%k2} -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermi2d %zmm9, %zmm22, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm31 +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm16 {%k2} +; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512F-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermt2d %zmm27, %zmm25, %zmm31 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512F-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm28, %zmm24 -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm15, %zmm14, %zmm17 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm16 
{%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512F-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm4 {%k2} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512F-NEXT: vpermt2d %zmm20, %zmm24, %zmm19 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512F-NEXT: vpermt2d %zmm27, %zmm22, %zmm19 +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} +; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512F-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512F-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm4 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm19, %zmm29 -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512F-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm22, %zmm24 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm2 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512F-NEXT: vpermt2d %zmm27, %zmm24, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} +; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm21 {%k2} -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm19 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm25 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm25 {%k1} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm24, %zmm22 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm22 {%k2} -; 
AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm19, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm19, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512F-NEXT: vpermt2d %zmm11, %zmm10, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vpermt2d %zmm8, %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vpermt2d %zmm9, %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} +; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} ; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm20, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm20, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512F-NEXT: vpermi2d %zmm27, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512F-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm27, %zmm3, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 384(%rax) +; 
AVX512F-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 640(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 832(%rax) ; AVX512F-NEXT: popq %rax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5736,200 +5746,201 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm28 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm23, 
%zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm27, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm27, %zmm28 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm18 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm30 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512BW-NEXT: vpermt2d 
%zmm26, %zmm25, %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm18 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm2, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 ; AVX512BW-NEXT: movw $-7741, %ax # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 ; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm22, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm31 +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm16 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 ; AVX512BW-NEXT: movw $-30962, %ax # imm = 0x870E ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm25, %zmm31 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 ; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm28, %zmm24 -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm15, %zmm14, %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm4 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; 
AVX512BW-NEXT: vpermt2d %zmm20, %zmm24, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm22, %zmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 ; AVX512BW-NEXT: movw $28897, %ax # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm19, %zmm29 -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 ; AVX512BW-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm22, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm24, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 ; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm21 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm25 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm22 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm10, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d 
%zmm27, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} ; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm27, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm27, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 640(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-NEXT: vmovdqa64 
%zmm0, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -5960,69 +5971,69 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movaps (%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdx), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm9 -; SSE-NEXT: movaps 16(%rcx), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm13 +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm12 +; SSE-NEXT: movaps 16(%r8), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rax), %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r9), %xmm1 @@ -6032,8 +6043,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6046,8 +6057,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 48(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r8), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6058,8 
+6069,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6073,8 +6084,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 64(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6085,8 +6096,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6111,8 +6122,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6124,11 +6135,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rcx), %xmm10 ; SSE-NEXT: movaps 96(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm1 @@ -6138,8 +6148,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 
96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6164,8 +6174,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6180,7 +6190,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 128(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6309,21 +6319,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm8 +; SSE-NEXT: movaps 208(%rcx), %xmm6 ; SSE-NEXT: movaps 208(%r8), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[1,1] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rax), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 208(%r9), %xmm6 +; SSE-NEXT: movdqa 208(%rax), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6331,10 +6342,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rsi), %xmm0 @@ -6343,22 +6354,22 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps 224(%rcx), %xmm3 +; SSE-NEXT: movaps 224(%rcx), %xmm4 ; SSE-NEXT: movaps 224(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 224(%r9), %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movaps 224(%r9), %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps 224(%rax), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6372,21 +6383,21 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 240(%rcx), %xmm8 +; SSE-NEXT: movaps 240(%rcx), %xmm6 ; SSE-NEXT: movaps 240(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; SSE-NEXT: movaps 240(%r9), %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = 
xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps 240(%rax), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6395,9 +6406,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -6407,139 +6418,187 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps 
%xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 
# 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movaps %xmm4, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 
= xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[1,3] +; SSE-NEXT: movaps (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -6553,21 +6612,21 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps 
{{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload @@ -6578,17 +6637,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -6601,77 +6660,28 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] @@ -6681,8 +6691,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = 
xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] @@ -6700,13 +6710,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -6715,178 +6725,197 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm14[3,3] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: movaps %xmm3, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0] ; SSE-NEXT: movaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 224(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movaps 240(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm3, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[0,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,0] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps 
{{.*#+}} xmm6 = xmm6[1,1],xmm1[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm11[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[3,3],mem[3,3] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm12[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm0 = 
xmm4[0],xmm0[1,2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 
+; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] @@ -6932,30 +6961,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] @@ -6984,19 +6995,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm4[0],xmm12[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm0[2,0] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm4[0],xmm11[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3] ; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7007,23 +7018,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 1760(%rax) -; SSE-NEXT: movaps %xmm11, 1744(%rax) +; SSE-NEXT: movaps %xmm7, 1744(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1728(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1696(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1696(%rax) ; SSE-NEXT: movaps %xmm5, 1680(%rax) -; SSE-NEXT: movaps %xmm6, 1648(%rax) -; SSE-NEXT: movaps %xmm7, 1632(%rax) +; SSE-NEXT: movaps %xmm8, 1648(%rax) +; SSE-NEXT: movaps %xmm9, 1632(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1616(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1584(%rax) -; SSE-NEXT: movaps %xmm9, 1568(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1584(%rax) +; SSE-NEXT: movaps %xmm10, 1568(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1536(%rax) ; SSE-NEXT: movaps %xmm13, 1520(%rax) -; SSE-NEXT: movaps %xmm10, 1472(%rax) +; SSE-NEXT: movaps %xmm12, 1472(%rax) ; SSE-NEXT: movaps %xmm14, 1456(%rax) ; SSE-NEXT: movaps %xmm15, 1408(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7042,101 +7053,101 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 1136(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1120(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1072(%rax) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1024(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1008(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 960(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 912(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 896(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 848(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 784(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 736(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 688(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 672(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 624(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 576(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 560(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 512(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 464(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 448(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 288(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1776(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1712(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1664(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1600(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
SSE-NEXT: movaps %xmm0, 1552(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1504(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1488(%rax) -; SSE-NEXT: movaps %xmm4, 1440(%rax) -; SSE-NEXT: movaps %xmm8, 1424(%rax) +; SSE-NEXT: movaps %xmm1, 1072(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1392(%rax) +; SSE-NEXT: movaps %xmm1, 1024(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1376(%rax) -; SSE-NEXT: movaps %xmm12, 1328(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1312(%rax) +; SSE-NEXT: movaps %xmm1, 1008(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 960(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 912(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 896(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 848(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 800(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 784(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 736(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 688(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 672(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 624(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 576(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 560(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 512(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 464(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 448(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 336(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 288(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1280(%rax) +; SSE-NEXT: movaps %xmm1, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1264(%rax) +; SSE-NEXT: movaps %xmm1, 224(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1776(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1712(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1664(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1600(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1552(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1504(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 1488(%rax) +; SSE-NEXT: movaps %xmm4, 1440(%rax) +; SSE-NEXT: movaps %xmm6, 1424(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1392(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1376(%rax) +; SSE-NEXT: movaps %xmm11, 1328(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1312(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1280(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1264(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1216(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1200(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1168(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1152(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1168(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1152(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1104(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7145,7 +7156,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 1056(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1040(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 992(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 976(%rax) @@ -7161,7 +7172,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 816(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 752(%rax) @@ -7179,7 +7190,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 592(%rax) ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%rax) -; SSE-NEXT: movaps %xmm2, 528(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7194,8 +7206,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 304(%rax) +; SSE-NEXT: movaps %xmm2, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7221,40 +7232,41 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3416, %rsp # imm = 0xD58 +; AVX1-ONLY-NEXT: subq $3432, %rsp # imm = 0xD68 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7266,18 +7278,18 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] @@ -7289,17 +7301,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[1,1],ymm12[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7309,26 +7320,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7336,28 +7347,28 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm6[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm10[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm8[1,1],ymm1[5,5],ymm8[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm10[1,1],ymm1[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7376,16 +7387,15 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 @@ -7404,10 +7414,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: 
vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm9[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7417,16 +7427,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7436,26 +7447,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 
= xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7463,12 +7474,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm6[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] @@ -7488,8 +7499,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm2[0],ymm14[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7508,7 +7519,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7516,9 +7527,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7526,14 +7537,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm6[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7543,11 +7554,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7563,26 +7574,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7590,34 +7599,34 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm6[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm3[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm1[1,1],ymm6[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm9[2,1],ymm1[6,4],ymm9[6,5] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,1],ymm1[6,4],ymm11[6,5] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -7626,24 +7635,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7651,193 +7660,194 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm4[1],xmm5[1],zero +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm7[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm4[1,1],ymm5[5,5],ymm4[5,5] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[1,1],ymm7[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm14[1,1],ymm7[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm11[2,1],ymm2[6,4],ymm11[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = 
ymm12[2,0],ymm1[2,1],ymm12[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm4[0,2],ymm8[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm1[0,2],ymm15[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[0,2],ymm10[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded 
Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm14[1],ymm4[3],ymm14[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[1,1],ymm4[0,2],ymm14[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; 
AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm1[0,2],ymm13[5,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1],ymm4[0,2],ymm6[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm4[0,2],ymm13[5,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm14[2],ymm7[3],ymm14[3],ymm7[6],ymm14[6],ymm7[7],ymm14[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm3[1],ymm11[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 
32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1],xmm0[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[1],xmm4[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm12[1],xmm4[1],zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[3,3],xmm2[3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 232(%r9), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: 
vbroadcastss 232(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 232(%r9), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 232(%rax), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3],ymm7[3,3],ymm14[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm6[3,3],ymm8[7,7],ymm6[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,3],ymm5[3,3],ymm6[7,7],ymm5[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 220(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 220(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm7[3,3],ymm11[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 220(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 220(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%rax), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm11[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] @@ -7849,12 +7859,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm3[3,3],ymm4[7,7],ymm3[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm3[3,3],ymm8[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm2[3,3],ymm6[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm4[3,3],ymm2[3,3],ymm4[7,7],ymm2[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 252(%r8), %ymm1 @@ -7864,9 +7874,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,1],ymm0[0,2],ymm8[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 236(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -7875,8 +7885,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm4[1,1],ymm3[5,5],ymm4[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm6[1,1],ymm2[5,5],ymm6[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm4[1,1],ymm2[5,5],ymm4[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 240(%r8), %ymm1 @@ -7889,10 +7899,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7903,13 +7913,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm5[3,3],mem[3,3],ymm5[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm7[3,3],ymm1[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7942,9 +7951,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7974,11 +7983,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm8[3,3],ymm0[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8008,11 +8018,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm12[3,3],ymm0[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm10[3,3],ymm0[7,7],ymm10[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm4 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm15[3,3],ymm1[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8041,170 +8051,170 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[3,3],mem[3,3],ymm14[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3],ymm1[3,3],ymm12[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3],ymm15[3,3],ymm12[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm11[3,3],ymm2[7,7],ymm11[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3],ymm1[1,2],ymm3[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,3],ymm11[3,3],ymm10[7,7],ymm11[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3],ymm10[3,3],ymm9[7,7],ymm10[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3],ymm9[3,3],ymm0[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,3],mem[3,3],ymm3[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3],ymm3[1,2],ymm5[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,1],ymm2[0,2],ymm6[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,1],ymm3[0,2],ymm6[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm2[0,2],ymm4[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[3,1],ymm12[0,2],ymm14[7,5],ymm12[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm1[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 
# 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[4],ymm11[4],ymm2[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1],ymm12[0,2],ymm2[7,5],ymm12[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,1],ymm14[0,2],ymm1[7,5],ymm14[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[3,3],xmm15[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[4],ymm9[4],ymm0[5],ymm9[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm10[3,1],ymm15[0,2],ymm10[7,5],ymm15[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm13[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1],ymm14[0,2],ymm1[7,5],ymm14[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm5[3,3],xmm8[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,1],ymm13[0,2],ymm0[7,5],ymm13[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm7[3,3],xmm8[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm9, 1440(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 1216(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 992(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 1440(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 992(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8303,42 +8313,41 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX1-ONLY-NEXT: addq $3416, %rsp # imm = 0xD58 +; AVX1-ONLY-NEXT: addq $3432, %rsp # imm = 0xD68 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $3000, %rsp # imm = 0xBB8 +; AVX2-SLOW-NEXT: subq $2968, %rsp # imm = 0xB98 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm13 ; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm7 -; AVX2-SLOW-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm5 +; AVX2-SLOW-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm10 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm9[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] @@ -8346,15 +8355,15 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm3[1],zero +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8376,10 +8385,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm11[1],xmm2[1],zero +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8410,9 +8420,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 @@ -8510,11 +8520,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8625,11 +8635,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), 
%ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8652,319 +8662,317 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 224(%r8), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 224(%r8), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-SLOW-NEXT: vbroadcastss %xmm3, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm3, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2],xmm14[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 228(%r8), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 228(%r8), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm12 ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 240(%rax), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 240(%rax), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm13 -; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm14 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm15 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovaps %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm5 -; AVX2-SLOW-NEXT: vbroadcastss %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm14[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm13[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 
# 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; 
AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm1[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] 
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 136(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 136(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = 
ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 168(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 168(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm6 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vbroadcastsd 200(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastsd 200(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 220(%r8), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 220(%r9), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 216(%rax), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 220(%r8), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 220(%r9), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 216(%rax), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = mem[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm7[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = 
ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovaps 224(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3],ymm6[2,3] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -8975,33 +8983,33 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm7[1,1],ymm1[5,5],ymm7[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9018,9 +9026,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX2-SLOW-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9034,8 +9042,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -9090,9 +9098,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm4[1,1],ymm0[5,5],ymm4[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -9103,44 +9111,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3],ymm12[3,3],ymm9[7,7],ymm12[7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; 
AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -9150,25 +9157,27 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9177,14 +9186,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -9194,103 +9202,104 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 144(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 144(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm5[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 
32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3,4],ymm8[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[4],ymm10[4],ymm1[5],ymm10[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm5[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = xmm7[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} 
ymm15 = ymm15[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3,4],ymm8[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-SLOW-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-SLOW-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = xmm1[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = xmm10[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm1[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm8, 1440(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 1440(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 1216(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 1088(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 992(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 864(%rax) +; 
AVX2-SLOW-NEXT: vmovaps %ymm2, 1216(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 1088(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 992(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 864(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 640(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm15, 544(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 416(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm12, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 640(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 1472(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 1472(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9373,13 +9382,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-SLOW-NEXT: addq $3000, %rsp # imm = 0xBB8 +; AVX2-SLOW-NEXT: addq $2968, %rsp # imm = 0xB98 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $3096, %rsp # imm = 0xC18 +; AVX2-FAST-NEXT: subq $3080, %rsp # imm = 0xC08 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9569,28 +9578,28 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[6],ymm1[6],ymm11[7],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm15 +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm13 +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9609,11 +9618,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9632,11 +9641,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; 
AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9655,11 +9664,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9678,11 +9687,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9702,10 +9711,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm9 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm13 +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm12 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] +; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7 @@ -9737,31 +9746,31 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vbroadcastss 228(%r8), %ymm4 ; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vinsertf128 $1, 224(%rax), %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm2 ; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm5 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 224(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] -; AVX2-FAST-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 224(%r8), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastss %xmm4, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastss 224(%rax), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] @@ -9774,51 +9783,51 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1],ymm0[1,1],ymm3[5,5],ymm0[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6],ymm15[7] -; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm15 -; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vbroadcastss 240(%r9), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 240(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6],ymm11[7] +; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vbroadcastss 240(%r9), %xmm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 240(%rax), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[6],ymm13[6],ymm9[7],ymm13[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 220(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 220(%r9), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastsd 216(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm14 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 220(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 220(%r9), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcastsd 216(%rax), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm11 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6],ymm14[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = 
ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm14 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm14, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm11, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm14[2,3],ymm15[2,3] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm11[2,3],ymm14[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] @@ -9838,25 +9847,25 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps 
{{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9867,43 +9876,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm11[1,1],mem[1,1],ymm11[5,5],mem[5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: 
vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -9927,37 +9936,37 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = 
xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -9975,43 +9984,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10023,39 +10032,39 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 136(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -10071,31 +10080,31 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 168(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -10105,9 +10114,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10119,42 +10128,42 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 200(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -10168,12 +10177,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -10182,178 +10191,175 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm3[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 
16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm3 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[4],ymm15[4],ymm2[5],ymm15[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] ; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm4[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm5[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4],ymm11[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm5[1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0],ymm12[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = 
ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm3[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0],ymm5[1,2],ymm9[3,4],ymm5[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6],ymm9[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm1[6],ymm9[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm9 = xmm8[3,3],mem[3,3] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; 
AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3,4],ymm10[5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6],ymm13[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = xmm8[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = xmm13[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm8[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2],ymm8[3,4],ymm9[5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm1[3,3],mem[3,3] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm7, 1440(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 1440(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 1216(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm3, 1088(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm12, 992(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm11, 864(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm4, 768(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 640(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm15, 544(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm14, 1216(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 1088(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm10, 992(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 864(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, 768(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 640(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 544(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10444,42 +10450,41 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps %ymm0, 1568(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX2-FAST-NEXT: addq $3096, %rsp # imm = 0xC18 +; AVX2-FAST-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $3000, %rsp # imm = 0xBB8 +; AVX2-FAST-PERLANE-NEXT: subq $2968, %rsp # imm = 0xB98 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: 
vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm9[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] @@ -10487,15 +10492,15 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10517,10 +10522,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), 
%xmm11 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm11[1],xmm2[1],zero +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm3[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10551,9 +10557,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r9), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 @@ -10651,11 +10657,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10766,11 +10772,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10793,319 +10799,317 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 224(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 224(%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm3, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm3, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm1[1],xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 228(%r8), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 228(%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 
$1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm4[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rax), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rax), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm14[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = 
ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm13[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm1[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = 
ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 136(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 136(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm9[3,3],xmm1[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte 
Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 168(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 168(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 200(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 200(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r9), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%rax), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r9), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%rax), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = mem[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = 
ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm7[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11116,33 +11120,33 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm7[1,1],ymm1[5,5],ymm7[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -11159,9 +11163,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; 
AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -11175,8 +11179,8 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -11231,9 +11235,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm4[1,1],ymm0[5,5],ymm4[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -11244,44 +11248,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3],ymm12[3,3],ymm9[7,7],ymm12[7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; 
AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11291,24 +11294,26 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11318,14 +11323,13 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -11335,103 +11339,104 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 144(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 144(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm5[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3,4],ymm8[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; 
AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[4],ymm10[4],ymm1[5],ymm10[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm5[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm7[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3,4],ymm8[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm1[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm10[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm1[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 1440(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 1440(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 1216(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 1088(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 992(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 864(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 
1216(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 1088(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 992(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 864(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 640(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 544(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 416(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1472(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1504(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 1472(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11514,943 +11519,959 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $3000, %rsp # imm = 0xBB8 +; AVX2-FAST-PERLANE-NEXT: addq $2968, %rsp # imm = 0xB98 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: store_i32_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3016, %rsp # imm = 0xBC8 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm13 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512F-NEXT: subq $3080, %rsp # imm = 0xC08 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; 
AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm9, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm9, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512F-NEXT: vpermt2d %zmm22, %zmm31, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm4, %zmm22, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm4, %zmm22, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm4, %zmm22, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm19, %zmm31, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 +; 
AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm3, %zmm13, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm3, %zmm13, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm3, %zmm13, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm13, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm9, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; 
AVX512F-NEXT: vpermt2d %zmm17, %zmm31, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm29 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm29, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm21 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm23, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm3, %zmm23, %zmm21 -; AVX512F-NEXT: vpermi2d %zmm23, %zmm3, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm7, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm21, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm21, %zmm30 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm9, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm25, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm24 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm2, %zmm29, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm2, %zmm29, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512F-NEXT: 
vmovdqa64 (%rsi), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm6, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, 
%zmm5 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm7, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm30, %zmm13 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm8, %zmm15 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm12, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm12, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm29, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm23 +; AVX512F-NEXT: vpermi2d %zmm13, %zmm12, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm13, %zmm12, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm12 ; AVX512F-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} ; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm28 +; AVX512F-NEXT: vmovdqa64 
{{.*#+}} zmm0 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512F-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm10, %zmm29 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm10, %zmm29 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512F-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm7, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} 
zmm7 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512F-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm7, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm31 {%k2} +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm30 {%k2} ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512F-NEXT: kmovw %ecx, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512F-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512F-NEXT: kmovw %ecx, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm9 {%k2} ; AVX512F-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm27 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm18 {%k3} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vpermi2d %zmm10, %zmm7, %zmm21 -; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm9 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm10 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload 
+; AVX512F-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm28 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vpermi2d %zmm10, %zmm7, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermi2d %zmm7, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> -; AVX512F-NEXT: vpermi2d %zmm7, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512F-NEXT: vpermi2d %zmm7, %zmm29, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vpermi2d %zmm7, %zmm13, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm12 {%k1} ; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermi2d %zmm10, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> -; AVX512F-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512F-NEXT: vpermi2d %zmm10, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512F-NEXT: vpermi2d %zmm10, %zmm15, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm17, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm17, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm8, %zmm15, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512F-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512F-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512F-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512F-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 1472(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 1408(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm31, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm30, 1216(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1088(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 896(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, 704(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%rax) -; 
AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 1536(%rax) -; AVX512F-NEXT: addq $3016, %rsp # imm = 0xBC8 +; AVX512F-NEXT: vmovdqa64 %zmm21, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 256(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 1600(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1536(%rax) +; AVX512F-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3016, %rsp # imm = 0xBC8 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512BW-NEXT: subq $3080, %rsp # imm = 0xC08 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm9, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm9, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm31, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm22, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm22, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm22, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm31, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm13, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm13, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte 
Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm29 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm29, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm21 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm23, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm23, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm23, %zmm3, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm21, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm21, %zmm30 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm25, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm29, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm29, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 ; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm6, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, 
%zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm5 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm13 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm12, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm23 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm12 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} ; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; 
AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm10, %zmm29 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm29 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm30 {%k2} ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C ; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: 
vmovdqa32 %zmm24, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm9 {%k2} ; AVX512BW-NEXT: movw $12384, %cx # imm = 0x3060 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm27 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm18 {%k3} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm7, %zmm21 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm28 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm7, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm29, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vpermi2d %zmm7, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm12 {%k1} ; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm13, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm11 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm15, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm15, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,25,6,7,8,9,10,11,26,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm12, %zmm11 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1472(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 1408(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1216(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 896(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 
704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 1536(%rax) -; AVX512BW-NEXT: addq $3016, %rsp # imm = 0xBC8 +; AVX512BW-NEXT: vmovdqa64 %zmm21, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 256(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1536(%rax) +; AVX512BW-NEXT: addq $3080, %rsp # imm = 0xC08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index 69d8fa57cd482..f2b18eb113bbc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -352,133 +352,134 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: subq $72, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps (%rdi), %xmm8 ; SSE-NEXT: movaps (%rsi), %xmm0 ; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movaps (%rcx), %xmm8 -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm7 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%r8), %xmm15 +; SSE-NEXT: movaps 16(%r8), %xmm10 ; SSE-NEXT: movaps (%r9), %xmm1 ; SSE-NEXT: movaps (%r10), %xmm14 -; SSE-NEXT: movaps 16(%r10), 
%xmm15 +; SSE-NEXT: movaps 16(%r10), %xmm12 ; SSE-NEXT: movaps (%rax), %xmm4 -; SSE-NEXT: movaps 16(%rax), %xmm10 +; SSE-NEXT: movaps 16(%rax), %xmm7 ; SSE-NEXT: movaps %xmm4, %xmm2 ; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] -; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm15, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: movaps 16(%r9), %xmm10 -; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: movaps 16(%r9), %xmm7 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdx), %xmm0 -; SSE-NEXT: movaps 16(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps 16(%rdx), %xmm6 +; SSE-NEXT: movaps 16(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm8, 
%xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm6[0] ; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE-NEXT: movaps %xmm15, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm10[0,2] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: movaps %xmm12, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm14[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm4[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm5[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm9[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm3[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm9[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE-NEXT: 
shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm9[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm15[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm12[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm2, 224(%rax) -; SSE-NEXT: movaps %xmm7, 240(%rax) -; SSE-NEXT: movaps %xmm3, 160(%rax) +; SSE-NEXT: movaps %xmm10, 240(%rax) +; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm4, 96(%rax) -; SSE-NEXT: movaps %xmm6, 112(%rax) +; SSE-NEXT: movaps %xmm14, 96(%rax) +; SSE-NEXT: movaps %xmm15, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps %xmm8, 48(%rax) -; SSE-NEXT: movaps %xmm10, 192(%rax) -; SSE-NEXT: movaps %xmm11, 208(%rax) +; SSE-NEXT: movaps %xmm5, 48(%rax) +; SSE-NEXT: movaps %xmm7, 192(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: addq $72, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride8_vf8: @@ -827,9 +828,9 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: movaps (%rcx), %xmm3 ; SSE-NEXT: movaps (%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm6 +; SSE-NEXT: movaps (%r9), %xmm7 ; SSE-NEXT: movaps (%r10), %xmm5 -; SSE-NEXT: movaps (%rax), %xmm7 +; SSE-NEXT: movaps (%rax), %xmm6 ; SSE-NEXT: movaps %xmm3, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] ; SSE-NEXT: movaps %xmm9, %xmm13 @@ -837,10 +838,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm13, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] ; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] ; SSE-NEXT: movaps %xmm14, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -848,7 +849,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -859,17 +860,17 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rcx), %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] ; SSE-NEXT: movaps 16(%r10), %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm2 @@ -916,109 +917,110 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm1 -; SSE-NEXT: movaps 32(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps 32(%rdx), %xmm3 +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps 32(%rdi), %xmm14 -; SSE-NEXT: movaps 32(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm0 -; SSE-NEXT: movaps 32(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps 32(%r8), %xmm9 +; SSE-NEXT: movaps 32(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm14, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r10), %xmm1 +; SSE-NEXT: movaps 32(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 
= xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: movaps %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: movaps %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] ; SSE-NEXT: movaps 48(%rdx), %xmm1 -; SSE-NEXT: movaps 48(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps 48(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] ; SSE-NEXT: movaps 48(%r10), %xmm0 -; SSE-NEXT: movaps 48(%rax), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 
-; SSE-NEXT: movaps 48(%r8), %xmm3 -; SSE-NEXT: movaps 48(%r9), %xmm13 -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] -; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movaps 48(%r8), %xmm5 +; SSE-NEXT: movaps 48(%r9), %xmm9 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: movaps %xmm6, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[2,0] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm2[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movaps %xmm3, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm0[0,2] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 496(%rax) -; SSE-NEXT: movaps %xmm4, 480(%rax) -; SSE-NEXT: movaps %xmm13, 464(%rax) +; SSE-NEXT: movaps %xmm5, 496(%rax) +; SSE-NEXT: movaps %xmm3, 480(%rax) +; SSE-NEXT: movaps %xmm9, 464(%rax) ; SSE-NEXT: movaps %xmm12, 448(%rax) -; SSE-NEXT: movaps %xmm8, 432(%rax) -; SSE-NEXT: movaps %xmm5, 416(%rax) +; SSE-NEXT: movaps %xmm6, 432(%rax) +; SSE-NEXT: movaps %xmm4, 416(%rax) ; SSE-NEXT: movaps %xmm10, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 384(%rax) -; SSE-NEXT: movaps %xmm9, 368(%rax) +; SSE-NEXT: movaps %xmm15, 384(%rax) +; SSE-NEXT: movaps %xmm11, 368(%rax) ; SSE-NEXT: movaps %xmm14, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps %xmm11, 304(%rax) -; SSE-NEXT: movaps %xmm15, 288(%rax) +; SSE-NEXT: movaps %xmm13, 304(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1113,17 +1115,17 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm13[2,3],ymm5[6,4],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,0],ymm13[2,3],ymm5[6,4],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm5 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] @@ -1134,7 +1136,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] @@ -1145,7 +1147,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] @@ -1156,7 +1158,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[4],ymm5[4],ymm12[5],ymm5[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5] @@ -1167,115 +1169,115 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm15[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} 
xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, 
%ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1302,108 +1304,108 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i32_stride8_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm10 ; 
AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm13 -; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] +; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm0 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm9[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm11[0],ymm5[1],ymm11[1],ymm5[4],ymm11[4],ymm5[5],ymm11[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = 
ymm10[0],ymm4[0],ymm10[1],ymm4[1],ymm10[4],ymm4[4],ymm10[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[6],ymm0[6],ymm9[7],ymm0[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm2 -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm4 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm9 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[4],ymm1[4],ymm6[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = 
ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 @@ -1419,74 +1421,75 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm15, %xmm1 +; AVX2-ONLY-NEXT: vbroadcastss %xmm14, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm1 ; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm6 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm9 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm1[0,1,2],xmm13[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm2[3] ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm15[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm5 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -1496,8 +1499,8 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm13[2],mem[2],xmm13[3],mem[3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 @@ -1505,10 +1508,10 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -1520,13 +1523,14 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 256(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) @@ -1544,7 +1548,7 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1907,21 +1911,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm1 -; SSE-NEXT: movaps 32(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 32(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 32(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r10), %xmm2 +; SSE-NEXT: movaps 32(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -1929,48 +1933,48 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r10), %xmm1 -; SSE-NEXT: movaps 48(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 48(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdi), %xmm7 +; SSE-NEXT: movaps 48(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: 
movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%r10), %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -1978,48 +1982,48 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 64(%rdi), %xmm8 -; SSE-NEXT: movaps 64(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 
64(%r10), %xmm1 -; SSE-NEXT: movaps 64(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 64(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps 64(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%r10), %xmm2 +; SSE-NEXT: movaps 64(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -2027,48 +2031,48 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 
80(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 80(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm1 -; SSE-NEXT: movaps 80(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 80(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdi), %xmm7 +; SSE-NEXT: movaps 80(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%r10), %xmm2 +; SSE-NEXT: movaps 80(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 ; SSE-NEXT: movaps 80(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -2076,135 +2080,136 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: 
shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm0 -; SSE-NEXT: movaps 96(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 96(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r10), %xmm1 -; SSE-NEXT: movaps 96(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps 96(%r8), %xmm7 -; SSE-NEXT: movaps 96(%r9), %xmm9 -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm1[0,2] -; SSE-NEXT: movaps 112(%rdx), %xmm2 -; SSE-NEXT: movaps 112(%rcx), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movaps 96(%rdx), %xmm2 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 112(%rsi), %xmm13 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] -; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movaps 96(%rdi), %xmm15 +; SSE-NEXT: movaps 96(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r10), %xmm1 -; SSE-NEXT: movaps 112(%rax), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; 
SSE-NEXT: movaps 112(%r8), %xmm3 -; SSE-NEXT: movaps 112(%r9), %xmm14 -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movaps 96(%r10), %xmm1 +; SSE-NEXT: movaps 96(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 96(%r8), %xmm13 +; SSE-NEXT: movaps 96(%r9), %xmm6 +; SSE-NEXT: movaps %xmm13, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm12, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] ; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm6[2],xmm13[3],xmm6[3] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[0,2] +; SSE-NEXT: movaps 112(%rdx), %xmm1 +; SSE-NEXT: movaps 112(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 112(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] ; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] +; SSE-NEXT: movaps 112(%r10), %xmm0 +; SSE-NEXT: movaps 112(%rax), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps 112(%r8), %xmm4 +; SSE-NEXT: movaps 
112(%r9), %xmm9 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm5[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1008(%rax) -; SSE-NEXT: movaps %xmm0, 992(%rax) -; SSE-NEXT: movaps %xmm14, 976(%rax) -; SSE-NEXT: movaps %xmm13, 960(%rax) -; SSE-NEXT: movaps %xmm8, 944(%rax) -; SSE-NEXT: movaps %xmm9, 928(%rax) +; SSE-NEXT: movaps %xmm4, 1008(%rax) +; SSE-NEXT: movaps %xmm2, 992(%rax) +; SSE-NEXT: movaps %xmm9, 976(%rax) +; SSE-NEXT: movaps %xmm11, 960(%rax) +; SSE-NEXT: movaps %xmm6, 944(%rax) +; SSE-NEXT: movaps %xmm3, 928(%rax) ; SSE-NEXT: movaps %xmm10, 912(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 896(%rax) -; SSE-NEXT: movaps %xmm7, 880(%rax) -; SSE-NEXT: movaps %xmm12, 864(%rax) +; SSE-NEXT: movaps %xmm14, 896(%rax) +; SSE-NEXT: movaps %xmm13, 880(%rax) +; SSE-NEXT: movaps %xmm15, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps %xmm11, 816(%rax) -; SSE-NEXT: movaps %xmm15, 800(%rax) +; SSE-NEXT: movaps %xmm12, 816(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 800(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2319,53 +2324,53 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm10 -; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm11 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 +; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm10 ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,0],ymm9[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = 
ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,0],ymm9[4,5],ymm6[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm8[1,0],ymm10[5,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[2,3],ymm9[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[1,0],ymm8[1,0],ymm11[5,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[2,0],ymm6[2,3],ymm9[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2,0],ymm9[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,0],ymm9[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm7[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm9 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0],ymm8[3,0],ymm10[7,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm8[3,0],ymm11[7,4],ymm8[7,4] ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm10[2,0],ymm12[2,3],ymm10[6,4],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm10 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[2,0],ymm12[2,3],ymm11[6,4],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] @@ -2373,21 +2378,21 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm9[1,0],ymm11[5,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm9[1,0],ymm10[5,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = 
ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[4],ymm7[4],ymm2[5],ymm7[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] @@ -2395,126 +2400,126 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,0],ymm9[3,0],ymm11[7,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,0],ymm9[3,0],ymm10[7,4],ymm9[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm6[2,3],ymm8[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = 
ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,0],ymm1[2,3],ymm7[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm5[1,0],ymm7[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm6[2,3],ymm8[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; 
AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,0],ymm8[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm5[3,0],ymm7[7,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = 
ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,3],ymm0[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -2525,189 +2530,189 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: 
vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8 -; 
AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = 
xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; 
AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -2717,11 +2722,11 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 832(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 800(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 608(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 512(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 608(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2784,154 +2789,154 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm10 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm12 -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm8 ; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklps 
{{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm7[0],ymm15[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm6 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 24(%r10), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm13[7] -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm8 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm10 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%rax), %ymm8 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = 
ymm12[1],ymm8[1],ymm12[3],ymm8[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 28(%rax), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[4],ymm5[4],ymm10[5],ymm5[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1],ymm8[2,3,4],ymm1[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 56(%r10), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm6[7] -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 56(%r10), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%rax), %ymm3 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastss 60(%rax), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 88(%r10), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 92(%rax), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; 
AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 88(%r10), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 92(%rax), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm2 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm8 @@ -2939,37 +2944,37 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = 
ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 120(%r10), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 120(%r10), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 124(%rax), %ymm0 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 124(%rax), %ymm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: 
vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 @@ -3081,117 +3086,117 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm2[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r10), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm0 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm8 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vunpcklps 
{{.*#+}} xmm9 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm6[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm3[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 -; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 -; 
AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm15 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm15 ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = 
xmm1[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm5[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm2[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 864(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 832(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 800(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 768(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 608(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 544(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 512(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 832(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 800(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 768(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 608(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 576(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 544(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 512(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3250,285 +3255,290 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm13 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512F-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%rax), %zmm30 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: 
vpermt2d %zmm13, %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm12, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm16, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512F-NEXT: vpermt2d %zmm30, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 +; AVX512F-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm14, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm28, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm27 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm1 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm13, %zmm21 -; 
AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm27, %zmm29 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm24, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm26, %zmm30 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2d %zmm1, %zmm19, %zmm9 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm27, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm27 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm23 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm24 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm25 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), 
%zmm17 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm11, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm8, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm6, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm5, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm29 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm19, %zmm3, %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2d 
%zmm0, %zmm19, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm7 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm5 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm19 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512F-NEXT: movb $-120, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} ; AVX512F-NEXT: movb $34, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} ; AVX512F-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm10 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm9 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: 
vmovdqa64 %zmm16, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 960(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-NEXT: addq $2056, %rsp # imm = 0x808 ; 
AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3539,285 +3549,290 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm28, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm6 +; 
AVX512BW-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm13, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm27, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm26, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm27, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 
%zmm16, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm27 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm11, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm8, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm5, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm5, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512BW-NEXT: 
vpermt2d %zmm0, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm7 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm5 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 ; AVX512BW-NEXT: movb $-120, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} ; AVX512BW-NEXT: movb $34, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: 
vmovdqa64 %zmm3, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm9 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; 
AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3945,21 +3960,21 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm1 -; SSE-NEXT: movaps 32(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 32(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 32(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r10), %xmm2 +; SSE-NEXT: movaps 32(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -3967,48 +3982,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r10), %xmm1 -; SSE-NEXT: movaps 48(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 48(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdi), %xmm7 +; SSE-NEXT: movaps 48(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%r10), %xmm2 
+; SSE-NEXT: movaps 48(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4016,48 +4031,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 64(%rdi), %xmm8 -; SSE-NEXT: movaps 64(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r10), %xmm1 -; SSE-NEXT: movaps 64(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 64(%rcx), %xmm1 +; 
SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps 64(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%r10), %xmm2 +; SSE-NEXT: movaps 64(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4065,48 +4080,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 80(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rsi), %xmm3 -; 
SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm1 -; SSE-NEXT: movaps 80(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 80(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdi), %xmm7 +; SSE-NEXT: movaps 80(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%r10), %xmm2 +; SSE-NEXT: movaps 80(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 ; SSE-NEXT: movaps 80(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4114,48 +4129,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; 
SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdx), %xmm0 -; SSE-NEXT: movaps 96(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 96(%rdi), %xmm8 -; SSE-NEXT: movaps 96(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r10), %xmm1 -; SSE-NEXT: movaps 96(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 96(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 96(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r10), %xmm2 +; SSE-NEXT: movaps 96(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 96(%r8), %xmm11 ; SSE-NEXT: movaps 96(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4163,48 +4178,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = 
xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdx), %xmm0 -; SSE-NEXT: movaps 112(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 112(%rdi), %xmm8 -; SSE-NEXT: movaps 112(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r10), %xmm1 -; SSE-NEXT: movaps 112(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 112(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 112(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%r10), %xmm2 +; SSE-NEXT: movaps 112(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 112(%r8), %xmm11 ; SSE-NEXT: movaps 112(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4212,48 +4227,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; 
SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdx), %xmm0 -; SSE-NEXT: movaps 128(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 128(%rdi), %xmm8 -; SSE-NEXT: movaps 128(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%r10), %xmm1 -; SSE-NEXT: movaps 128(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 128(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 128(%rdi), %xmm7 +; SSE-NEXT: movaps 128(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%r10), %xmm2 +; SSE-NEXT: movaps 128(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 128(%r8), %xmm11 ; SSE-NEXT: movaps 128(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4261,48 +4276,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: 
movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdx), %xmm0 -; SSE-NEXT: movaps 144(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 144(%rdi), %xmm8 -; SSE-NEXT: movaps 144(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%r10), %xmm1 -; SSE-NEXT: movaps 144(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 144(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 144(%rdi), %xmm7 +; SSE-NEXT: movaps 144(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%r10), %xmm2 +; SSE-NEXT: movaps 144(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 144(%r8), %xmm11 ; SSE-NEXT: movaps 144(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4310,48 +4325,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; 
SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdx), %xmm0 -; SSE-NEXT: movaps 160(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps 160(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%r10), %xmm1 -; SSE-NEXT: movaps 160(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 160(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%r10), %xmm2 +; SSE-NEXT: movaps 160(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = 
xmm5[0],xmm2[0] ; SSE-NEXT: movaps 160(%r8), %xmm11 ; SSE-NEXT: movaps 160(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4359,48 +4374,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdx), %xmm0 -; SSE-NEXT: movaps 176(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 176(%rdi), %xmm8 -; SSE-NEXT: movaps 176(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%r10), %xmm1 -; SSE-NEXT: movaps 176(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 176(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 
176(%rdi), %xmm7 +; SSE-NEXT: movaps 176(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%r10), %xmm2 +; SSE-NEXT: movaps 176(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 176(%r8), %xmm11 ; SSE-NEXT: movaps 176(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4408,48 +4423,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdx), %xmm0 -; SSE-NEXT: movaps 192(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps 192(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] 
-; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%r10), %xmm1 -; SSE-NEXT: movaps 192(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 192(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 192(%rdi), %xmm7 +; SSE-NEXT: movaps 192(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%r10), %xmm2 +; SSE-NEXT: movaps 192(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 192(%r8), %xmm11 ; SSE-NEXT: movaps 192(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4457,48 +4472,48 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdx), %xmm0 -; SSE-NEXT: movaps 208(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 208(%rdi), %xmm8 -; SSE-NEXT: movaps 208(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r10), %xmm1 -; SSE-NEXT: movaps 208(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 208(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 208(%rdi), %xmm7 +; SSE-NEXT: movaps 208(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%r10), %xmm2 +; SSE-NEXT: movaps 208(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 208(%r8), %xmm11 ; SSE-NEXT: movaps 208(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4506,135 +4521,136 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm0 -; SSE-NEXT: movaps 224(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 224(%rdx), %xmm1 +; SSE-NEXT: movaps 224(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps 224(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%r10), %xmm1 -; SSE-NEXT: movaps 224(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps 224(%r8), %xmm8 +; SSE-NEXT: movaps 224(%rsi), %xmm4 +; SSE-NEXT: movaps %xmm12, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%r10), %xmm2 +; SSE-NEXT: movaps 224(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 224(%r8), %xmm15 ; SSE-NEXT: movaps 224(%r9), %xmm6 -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm15, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm12, %xmm5 -; 
SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[0,2] ; SSE-NEXT: movaps 240(%rdx), %xmm2 -; SSE-NEXT: movaps 240(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rsi), %xmm13 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%r10), %xmm1 -; SSE-NEXT: movaps 240(%rax), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movaps 240(%r8), %xmm4 -; SSE-NEXT: movaps 240(%r9), %xmm14 -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] +; SSE-NEXT: movaps 240(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rsi), %xmm11 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] +; SSE-NEXT: movaps 240(%r10), %xmm0 +; SSE-NEXT: movaps 240(%rax), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps 240(%r8), %xmm5 +; SSE-NEXT: movaps 240(%r9), %xmm9 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm8, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm3[2,0] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm3[2,0] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = 
xmm1[2],xmm11[2],xmm1[3],xmm11[3] ; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: movaps %xmm4, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 2032(%rax) -; SSE-NEXT: movaps %xmm0, 2016(%rax) -; SSE-NEXT: movaps %xmm14, 2000(%rax) -; SSE-NEXT: movaps %xmm13, 1984(%rax) -; SSE-NEXT: movaps %xmm9, 1968(%rax) -; SSE-NEXT: movaps %xmm5, 1952(%rax) +; SSE-NEXT: movaps %xmm5, 2032(%rax) +; SSE-NEXT: movaps %xmm1, 2016(%rax) +; SSE-NEXT: movaps %xmm9, 2000(%rax) +; SSE-NEXT: movaps %xmm11, 1984(%rax) +; SSE-NEXT: movaps %xmm6, 1968(%rax) +; SSE-NEXT: movaps %xmm4, 1952(%rax) ; SSE-NEXT: movaps %xmm10, 1936(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1920(%rax) -; SSE-NEXT: movaps %xmm8, 1904(%rax) +; SSE-NEXT: movaps %xmm14, 1920(%rax) +; SSE-NEXT: movaps %xmm15, 1904(%rax) ; SSE-NEXT: movaps %xmm12, 1888(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1872(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1856(%rax) -; SSE-NEXT: movaps %xmm11, 1840(%rax) -; SSE-NEXT: movaps %xmm15, 1824(%rax) +; SSE-NEXT: movaps %xmm13, 1840(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1824(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1808(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4879,50 +4895,50 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm11 ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = 
ymm10[0,1],ymm8[2,0],ymm10[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,0],ymm9[2,3],ymm10[6,4],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm9[1,0],ymm7[1,0],ymm9[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1],ymm8[2,0],ymm10[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm9[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm10 ; AVX1-ONLY-NEXT: vunpckhps 
{{.*#+}} ymm12 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4] ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[2,0],ymm12[2,3],ymm8[6,4],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm9[2,0],ymm12[2,3],ymm9[6,4],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm9 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] @@ -4931,21 +4947,21 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm2[0],ymm9[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[4],ymm9[4],ymm7[5],ymm9[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[4],ymm9[4],ymm2[5],ymm9[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm8[0],ymm2[1],ymm8[1],ymm2[4],ymm8[4],ymm2[5],ymm8[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] @@ -4953,25 +4969,25 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = 
ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[6],ymm9[6],ymm7[7],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm8[2],ymm2[3],ymm8[3],ymm2[6],ymm8[6],ymm2[7],ymm8[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 @@ -4983,309 +4999,309 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,0],ymm1[2,3],ymm7[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = 
xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] 
; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,3],ymm0[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,3],ymm0[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 
128(%r10), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%r10), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = 
ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 160(%r10), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%r10), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; 
AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, 
%ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 192(%r10), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%r10), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 -; AVX1-ONLY-NEXT: 
vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%r10), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 224(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%r10), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 
= ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps 
{{.*#+}} ymm8 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,3],ymm2[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,3],ymm0[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[3,0],ymm4[7,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 @@ -5293,43 +5309,43 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] @@ -5337,343 +5353,343 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovaps 32(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; 
AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] 
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 128(%r10), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm0[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 160(%r10), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = 
xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 192(%r10), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 192(%r10), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = 
ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 224(%r10), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 224(%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -5683,11 +5699,11 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 1856(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 1824(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 1792(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 1632(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 1600(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 1568(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 1536(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 1632(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 1600(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 1568(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 1536(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1376(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5817,151 +5833,151 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm11 ; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm10 ; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm9[0],ymm15[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[4],ymm11[4],ymm9[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm8[0],ymm15[2],ymm8[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm8 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = 
ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm10 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm8 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm7 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[6],ymm12[6],ymm7[7],ymm12[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm12[2],ymm5[3],ymm12[3],ymm5[6],ymm12[6],ymm5[7],ymm12[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm7[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[4],ymm10[4],ymm8[5],ymm10[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm6[7] -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm3 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[4],ymm7[4],ymm4[5],ymm7[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 84(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 88(%rax), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 92(%r10), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 88(%rax), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[6],ymm0[6],ymm6[7],ymm0[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 92(%r10), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm2 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 96(%r10), %ymm8 @@ -5969,37 +5985,37 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 116(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 120(%rax), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 120(%rax), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 
= ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 124(%r10), %ymm0 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 124(%r10), %ymm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 @@ -6523,8 +6539,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm3 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -6536,104 +6552,104 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 ; AVX2-ONLY-NEXT: vmovaps 192(%r10), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm6 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm6[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm4 -; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm6 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 224(%r10), %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm15 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm15 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vunpcklps 
{{.*#+}} xmm13 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm5[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm2[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1888(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1856(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1824(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 1792(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1632(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1600(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1568(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1536(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1856(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1824(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1792(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1632(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1600(%rax) +; 
AVX2-ONLY-NEXT: vmovaps %ymm10, 1568(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1536(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1376(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6752,92 +6768,97 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i32_stride8_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6152, %rsp # imm = 0x1808 +; AVX512F-NEXT: subq $6216, %rsp # imm = 0x1848 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: vmovdqa64 (%r10), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm24 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm0 +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm30 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, 
%zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm26, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm1, %zmm26, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; 
AVX512F-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm1 ; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm0 @@ -6845,196 +6866,209 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm25 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqa64 
128(%r8), %zmm22 ; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm27 ; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm23, %zmm8 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm27, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; 
AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 
%zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm14 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-NEXT: 
vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 ; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm2 @@ -7045,228 +7079,222 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm12, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm8, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm30 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm4, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm29 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm3, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; 
AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm18 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm15 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm15 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 ; AVX512F-NEXT: movb $-120, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: movb $34, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 
{%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm5 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm25 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm31 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm31 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; 
AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm30 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm28 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm28 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm12 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm27 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm12 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm24 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm22 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7283,24 +7311,30 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7323,8 +7357,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7338,33 +7372,33 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 {%k3} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 1984(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm3, 1920(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1792(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm10, 1664(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm11, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512F-NEXT: 
vmovdqa64 %zmm16, 1472(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 1536(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 1408(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm18, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1088(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 1280(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm23, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm24, 1024(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm12, 960(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm28, 896(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm29, 832(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm30, 768(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm31, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%rax) @@ -7382,98 +7416,103 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: addq $6152, %rsp # imm = 0x1808 +; AVX512F-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride8_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6152, %rsp # imm = 0x1808 +; AVX512BW-NEXT: subq $6216, %rsp # imm = 0x1848 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm24 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm0 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm26, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm26, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, 
%zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm0 @@ -7481,196 +7520,209 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; 
AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm26 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; 
AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm22 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm27 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm3 ; 
AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm23, %zmm8 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm27, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 
%zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm17 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm2 @@ -7681,228 +7733,222 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm15 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, 
%zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vpermi2d 
%zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 ; AVX512BW-NEXT: movb $-120, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: movb $34, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm31 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm28 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm28 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; 
AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm12 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm24 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm22 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7919,24 +7965,30 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 
%zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7959,8 +8011,8 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} @@ -7974,33 +8026,33 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1984(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 1920(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 1664(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1472(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1472(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1408(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 1280(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 1024(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 960(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 896(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm29, 832(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm30, 768(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rax) @@ -8018,7 +8070,7 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: 
vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: addq $6152, %rsp # imm = 0x1808 +; AVX512BW-NEXT: addq $6216, %rsp # imm = 0x1848 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll index c794b0fd83339..a9a213c8a5905 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll @@ -407,49 +407,49 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movaps 112(%rdi), %xmm14 ; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps 64(%rdi), %xmm10 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps 48(%rdi), %xmm12 ; SSE-NEXT: movaps 96(%rsi), %xmm0 ; SSE-NEXT: movaps 80(%rsi), %xmm1 ; SSE-NEXT: movaps 64(%rsi), %xmm2 ; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm4 ; SSE-NEXT: movaps 32(%rsi), %xmm5 -; SSE-NEXT: movaps 48(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movaps 48(%rsi), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm5[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: 
movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -461,62 +461,62 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] -; SSE-NEXT: movaps 144(%rdi), %xmm10 -; SSE-NEXT: movaps 144(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm10, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: movaps 160(%rdi), %xmm12 +; SSE-NEXT: movaps 128(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 160(%rdi), %xmm10 ; SSE-NEXT: movaps 160(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 176(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm8 ; SSE-NEXT: movaps 176(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm1 -; SSE-NEXT: movaps 208(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 192(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm5 +; SSE-NEXT: movaps 208(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE-NEXT: movaps 224(%rdi), %xmm1 ; SSE-NEXT: movaps 224(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps 240(%rsi), %xmm5 +; SSE-NEXT: movaps 240(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm3, 496(%rdx) ; SSE-NEXT: movaps %xmm0, 480(%rdx) -; SSE-NEXT: movaps %xmm2, 464(%rdx) -; SSE-NEXT: movaps %xmm4, 448(%rdx) -; SSE-NEXT: movaps %xmm1, 432(%rdx) -; SSE-NEXT: movaps %xmm6, 416(%rdx) -; SSE-NEXT: movaps %xmm7, 400(%rdx) -; SSE-NEXT: movaps %xmm8, 384(%rdx) -; SSE-NEXT: movaps %xmm9, 368(%rdx) +; SSE-NEXT: movaps %xmm1, 464(%rdx) +; SSE-NEXT: movaps %xmm2, 448(%rdx) +; SSE-NEXT: movaps %xmm5, 432(%rdx) +; SSE-NEXT: movaps %xmm7, 416(%rdx) +; SSE-NEXT: movaps %xmm6, 400(%rdx) +; SSE-NEXT: movaps %xmm9, 384(%rdx) +; SSE-NEXT: movaps %xmm8, 368(%rdx) ; SSE-NEXT: movaps %xmm11, 352(%rdx) -; SSE-NEXT: movaps %xmm12, 336(%rdx) -; SSE-NEXT: movaps %xmm13, 320(%rdx) -; SSE-NEXT: movaps %xmm10, 304(%rdx) -; SSE-NEXT: movaps %xmm15, 288(%rdx) -; SSE-NEXT: movaps %xmm14, 272(%rdx) +; SSE-NEXT: movaps %xmm10, 336(%rdx) +; SSE-NEXT: movaps %xmm12, 320(%rdx) +; SSE-NEXT: movaps %xmm13, 304(%rdx) +; SSE-NEXT: movaps %xmm14, 288(%rdx) +; SSE-NEXT: movaps %xmm15, 272(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -645,10 +645,10 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm10 @@ -672,11 +672,11 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] @@ -684,11 +684,11 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] @@ -715,11 +715,11 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 256(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 288(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) @@ -831,178 +831,178 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: movaps 128(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 144(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 144(%rdi), %xmm1 +; SSE-NEXT: movaps 144(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm2 -; SSE-NEXT: movaps 160(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 160(%rdi), %xmm1 +; SSE-NEXT: movaps 160(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm2 -; SSE-NEXT: movaps 176(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm1 +; SSE-NEXT: movaps 176(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm2 -; SSE-NEXT: movaps 192(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 192(%rdi), %xmm1 +; SSE-NEXT: movaps 192(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm2 -; SSE-NEXT: movaps 208(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm1 +; SSE-NEXT: movaps 208(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps 224(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps 224(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm2 -; SSE-NEXT: movaps 240(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm2 -; SSE-NEXT: movaps 256(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 256(%rdi), %xmm1 +; SSE-NEXT: movaps 256(%rsi), %xmm0 +; SSE-NEXT: 
movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm2 -; SSE-NEXT: movaps 272(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 272(%rdi), %xmm1 +; SSE-NEXT: movaps 272(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm2 -; SSE-NEXT: movaps 288(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 288(%rdi), %xmm1 +; SSE-NEXT: movaps 288(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm2 -; SSE-NEXT: movaps 304(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 304(%rdi), %xmm1 +; SSE-NEXT: movaps 304(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps 320(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm2 -; SSE-NEXT: movaps 336(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps 336(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm2 -; SSE-NEXT: movaps 352(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 352(%rdi), %xmm1 +; SSE-NEXT: movaps 352(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm15 ; SSE-NEXT: movaps 368(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 384(%rdi), %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 384(%rdi), %xmm13 ; SSE-NEXT: movaps 384(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 400(%rdi), %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 400(%rdi), %xmm11 ; SSE-NEXT: movaps 400(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 416(%rdi), %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 416(%rdi), %xmm12 ; SSE-NEXT: movaps 416(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 432(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 432(%rdi), %xmm8 ; SSE-NEXT: movaps 432(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 448(%rdi), %xmm5 -; SSE-NEXT: movaps 448(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 448(%rdi), %xmm6 +; SSE-NEXT: movaps 448(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdi), %xmm5 +; SSE-NEXT: movaps 464(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm1 -; SSE-NEXT: movaps 464(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps 480(%rdi), %xmm2 +; SSE-NEXT: movaps 480(%rdi), %xmm1 ; SSE-NEXT: movaps 480(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movaps 496(%rdi), %xmm3 -; SSE-NEXT: movaps 496(%rsi), %xmm6 +; SSE-NEXT: movaps 496(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps 
{{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm3, 1008(%rdx) ; SSE-NEXT: movaps %xmm0, 992(%rdx) -; SSE-NEXT: movaps %xmm2, 976(%rdx) -; SSE-NEXT: movaps %xmm4, 960(%rdx) -; SSE-NEXT: movaps %xmm1, 944(%rdx) +; SSE-NEXT: movaps %xmm1, 976(%rdx) +; SSE-NEXT: movaps %xmm2, 960(%rdx) +; SSE-NEXT: movaps %xmm5, 944(%rdx) ; SSE-NEXT: movaps %xmm7, 928(%rdx) -; SSE-NEXT: movaps %xmm5, 912(%rdx) -; SSE-NEXT: movaps %xmm8, 896(%rdx) -; SSE-NEXT: movaps %xmm10, 880(%rdx) -; SSE-NEXT: movaps %xmm11, 864(%rdx) -; SSE-NEXT: movaps %xmm9, 848(%rdx) -; SSE-NEXT: movaps %xmm12, 832(%rdx) -; SSE-NEXT: movaps %xmm14, 816(%rdx) +; SSE-NEXT: movaps %xmm6, 912(%rdx) +; SSE-NEXT: movaps %xmm9, 896(%rdx) +; SSE-NEXT: movaps %xmm8, 880(%rdx) +; SSE-NEXT: movaps %xmm10, 864(%rdx) +; SSE-NEXT: movaps %xmm12, 848(%rdx) +; SSE-NEXT: movaps %xmm14, 832(%rdx) +; SSE-NEXT: movaps %xmm11, 816(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rdx) -; SSE-NEXT: movaps %xmm15, 784(%rdx) +; SSE-NEXT: movaps %xmm13, 784(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rdx) -; SSE-NEXT: movaps %xmm13, 752(%rdx) +; SSE-NEXT: movaps %xmm15, 752(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 736(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1304,85 +1304,85 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-LABEL: store_i64_stride2_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm8[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm6[0,1,1,3] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm10[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm7[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm10[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm10[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm12[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm11[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm11[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm12[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm3[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm12[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm13[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = 
ymm4[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm2[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,2,2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %ymm1 @@ -1410,22 +1410,22 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm1[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,0,2,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll index 6ae1465d3438e..780c3c55d3d3d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -340,79 +340,78 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i64_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movapd 64(%rdi), %xmm4 -; SSE-NEXT: movapd (%rdi), %xmm0 -; SSE-NEXT: movapd 16(%rdi), %xmm1 -; SSE-NEXT: movapd 32(%rdi), %xmm2 -; SSE-NEXT: movapd 48(%rdi), %xmm5 +; SSE-NEXT: movapd 64(%rdi), %xmm5 +; SSE-NEXT: movapd (%rdi), %xmm1 +; SSE-NEXT: movapd 16(%rdi), %xmm2 +; SSE-NEXT: movapd 32(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm6 ; SSE-NEXT: movapd 64(%rsi), %xmm9 -; SSE-NEXT: 
movapd (%rsi), %xmm3 -; SSE-NEXT: movapd 16(%rsi), %xmm6 -; SSE-NEXT: movapd 32(%rsi), %xmm7 +; SSE-NEXT: movapd (%rsi), %xmm4 +; SSE-NEXT: movapd 16(%rsi), %xmm7 +; SSE-NEXT: movapd 32(%rsi), %xmm11 ; SSE-NEXT: movapd 48(%rsi), %xmm10 ; SSE-NEXT: movapd 64(%rdx), %xmm15 -; SSE-NEXT: movapd (%rdx), %xmm11 +; SSE-NEXT: movapd (%rdx), %xmm0 ; SSE-NEXT: movapd 16(%rdx), %xmm12 ; SSE-NEXT: movapd 32(%rdx), %xmm13 ; SSE-NEXT: movapd 48(%rdx), %xmm14 -; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm3[0] +; SSE-NEXT: movapd %xmm1, %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] ; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm1, %xmm11 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm2, %xmm12 -; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm7[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm5, %xmm13 +; SSE-NEXT: movapd %xmm3, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] +; SSE-NEXT: movapd %xmm6, %xmm13 ; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm14[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm14[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1] -; SSE-NEXT: movapd %xmm4, %xmm14 +; SSE-NEXT: movapd %xmm5, %xmm14 ; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm15[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] ; SSE-NEXT: movapd 80(%rdi), %xmm15 -; SSE-NEXT: movapd 80(%rsi), %xmm6 +; SSE-NEXT: movapd 80(%rsi), %xmm7 ; SSE-NEXT: movapd %xmm15, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm7[0] ; SSE-NEXT: movapd 80(%rdx), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = 
xmm6[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movapd 96(%rdi), %xmm4 -; SSE-NEXT: movapd 96(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm4, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movapd 96(%rsi), %xmm3 +; SSE-NEXT: movapd %xmm4, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] ; SSE-NEXT: movapd 96(%rdx), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movapd 112(%rdi), %xmm2 ; SSE-NEXT: movapd 112(%rsi), %xmm0 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movapd 112(%rdx), %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movapd %xmm0, 368(%rcx) ; SSE-NEXT: movapd %xmm2, 352(%rcx) -; SSE-NEXT: movapd %xmm3, 336(%rcx) -; SSE-NEXT: movapd %xmm1, 320(%rcx) +; SSE-NEXT: movapd %xmm1, 336(%rcx) +; SSE-NEXT: movapd %xmm3, 320(%rcx) ; SSE-NEXT: movapd %xmm4, 304(%rcx) -; SSE-NEXT: movapd %xmm7, 288(%rcx) -; SSE-NEXT: movapd %xmm6, 272(%rcx) +; SSE-NEXT: movapd %xmm6, 288(%rcx) +; SSE-NEXT: movapd %xmm7, 272(%rcx) ; SSE-NEXT: movapd %xmm15, 256(%rcx) ; SSE-NEXT: movapd %xmm8, 240(%rcx) ; SSE-NEXT: movapd %xmm9, 224(%rcx) @@ -423,8 +422,7 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rcx) ; SSE-NEXT: movapd %xmm13, 144(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rcx) +; SSE-NEXT: movapd %xmm11, 128(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movapd %xmm12, 96(%rcx) @@ -432,7 +430,8 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) -; SSE-NEXT: movapd %xmm11, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -524,13 +523,13 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride3_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] @@ -539,37 +538,37 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} 
ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm11[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm12[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm13[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] @@ -585,13 +584,13 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 288(%rcx) -; 
AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) @@ -751,67 +750,67 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 160(%rdi), %xmm1 -; SSE-NEXT: movapd 160(%rsi), %xmm14 +; SSE-NEXT: movapd 160(%rsi), %xmm15 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm14[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm15[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 160(%rdx), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movapd 176(%rdi), %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movapd 176(%rdi), %xmm13 ; SSE-NEXT: movapd 176(%rsi), %xmm12 -; SSE-NEXT: movapd %xmm15, %xmm0 +; SSE-NEXT: movapd %xmm13, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 176(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movapd 192(%rdi), %xmm10 +; SSE-NEXT: movapd 192(%rdi), %xmm9 ; SSE-NEXT: movapd 192(%rsi), %xmm8 -; SSE-NEXT: movapd %xmm10, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0] +; SSE-NEXT: movapd %xmm9, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm8[0] ; SSE-NEXT: movapd 192(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movapd 208(%rdi), %xmm9 -; SSE-NEXT: movapd 208(%rsi), %xmm6 -; SSE-NEXT: movapd %xmm9, %xmm11 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movapd 208(%rdi), %xmm10 +; SSE-NEXT: movapd 208(%rsi), %xmm7 +; SSE-NEXT: movapd %xmm10, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] ; SSE-NEXT: movapd 208(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movapd 224(%rdi), %xmm4 -; SSE-NEXT: movapd 224(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm4, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movapd 224(%rsi), %xmm3 +; SSE-NEXT: movapd %xmm4, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] ; SSE-NEXT: movapd 224(%rdx), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 240(%rsi), %xmm0 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = 
xmm3[0],xmm0[0] +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movapd 240(%rdx), %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movapd %xmm0, 752(%rcx) ; SSE-NEXT: movapd %xmm2, 736(%rcx) -; SSE-NEXT: movapd %xmm3, 720(%rcx) -; SSE-NEXT: movapd %xmm1, 704(%rcx) +; SSE-NEXT: movapd %xmm1, 720(%rcx) +; SSE-NEXT: movapd %xmm3, 704(%rcx) ; SSE-NEXT: movapd %xmm4, 688(%rcx) -; SSE-NEXT: movapd %xmm7, 672(%rcx) -; SSE-NEXT: movapd %xmm6, 656(%rcx) -; SSE-NEXT: movapd %xmm9, 640(%rcx) +; SSE-NEXT: movapd %xmm6, 672(%rcx) +; SSE-NEXT: movapd %xmm7, 656(%rcx) +; SSE-NEXT: movapd %xmm10, 640(%rcx) ; SSE-NEXT: movapd %xmm11, 624(%rcx) ; SSE-NEXT: movapd %xmm8, 608(%rcx) -; SSE-NEXT: movapd %xmm10, 592(%rcx) -; SSE-NEXT: movapd %xmm13, 576(%rcx) +; SSE-NEXT: movapd %xmm9, 592(%rcx) +; SSE-NEXT: movapd %xmm14, 576(%rcx) ; SSE-NEXT: movapd %xmm12, 560(%rcx) -; SSE-NEXT: movapd %xmm15, 544(%rcx) +; SSE-NEXT: movapd %xmm13, 544(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 528(%rcx) -; SSE-NEXT: movapd %xmm14, 512(%rcx) +; SSE-NEXT: movapd %xmm15, 512(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -886,62 +885,62 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm12 +; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm11 ; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm13 ; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, 
%ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] @@ -957,48 +956,48 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm9 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 128(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovapd 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm7[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm6[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm5[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm4[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2],ymm8[3] ; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm15 ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2],ymm14[3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2],ymm14[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],mem[2],ymm14[3] @@ -1009,32 +1008,32 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2],ymm9[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2],ymm5[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} 
ymm3 = ymm3[0],ymm15[1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm11, 736(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 704(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 640(%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd %ymm12, 736(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 704(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 640(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 608(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 512(%rcx) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 416(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 320(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 320(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rcx) @@ -1043,7 +1042,7 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd %ymm13, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1067,96 +1066,96 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride3_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $168, %rsp -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm15 -; 
AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm5[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; 
AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[2,1,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[0,1,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,1,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm10[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] @@ -1166,21 +1165,21 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 192(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd 
{{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] @@ -1188,16 +1187,16 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 224(%rdx), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm4[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm4, 736(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 704(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 704(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 640(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 608(%rcx) @@ -1205,12 +1204,12 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm3, 544(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 512(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 480(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 448(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 416(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1582,56 +1581,56 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movapd 432(%rdi), %xmm13 +; SSE-NEXT: movapd 432(%rdi), %xmm14 ; SSE-NEXT: movapd 432(%rsi), %xmm12 -; SSE-NEXT: movapd %xmm13, %xmm0 +; SSE-NEXT: movapd %xmm14, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 432(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movapd 448(%rdi), %xmm10 +; SSE-NEXT: movapd 448(%rdi), %xmm9 ; SSE-NEXT: movapd 448(%rsi), %xmm8 -; SSE-NEXT: movapd %xmm10, %xmm14 -; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm8[0] +; SSE-NEXT: movapd %xmm9, %xmm13 +; 
SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0] ; SSE-NEXT: movapd 448(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movapd 464(%rdi), %xmm9 -; SSE-NEXT: movapd 464(%rsi), %xmm6 -; SSE-NEXT: movapd %xmm9, %xmm11 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm6[0] +; SSE-NEXT: movapd 464(%rdi), %xmm10 +; SSE-NEXT: movapd 464(%rsi), %xmm7 +; SSE-NEXT: movapd %xmm10, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] ; SSE-NEXT: movapd 464(%rdx), %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movapd 480(%rdi), %xmm4 -; SSE-NEXT: movapd 480(%rsi), %xmm1 -; SSE-NEXT: movapd %xmm4, %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movapd 480(%rsi), %xmm3 +; SSE-NEXT: movapd %xmm4, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] ; SSE-NEXT: movapd 480(%rdx), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movapd 496(%rdi), %xmm2 ; SSE-NEXT: movapd 496(%rsi), %xmm0 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movapd 496(%rdx), %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movapd %xmm0, 1520(%rcx) ; SSE-NEXT: movapd %xmm2, 1504(%rcx) -; SSE-NEXT: movapd %xmm3, 1488(%rcx) -; SSE-NEXT: movapd %xmm1, 1472(%rcx) +; SSE-NEXT: movapd %xmm1, 1488(%rcx) +; SSE-NEXT: movapd %xmm3, 1472(%rcx) ; SSE-NEXT: movapd %xmm4, 1456(%rcx) -; SSE-NEXT: movapd %xmm7, 1440(%rcx) -; SSE-NEXT: movapd %xmm6, 1424(%rcx) -; SSE-NEXT: movapd %xmm9, 1408(%rcx) +; SSE-NEXT: movapd %xmm6, 1440(%rcx) +; SSE-NEXT: movapd %xmm7, 1424(%rcx) +; SSE-NEXT: movapd %xmm10, 1408(%rcx) ; SSE-NEXT: movapd %xmm11, 1392(%rcx) ; SSE-NEXT: movapd %xmm8, 1376(%rcx) -; SSE-NEXT: movapd %xmm10, 1360(%rcx) -; SSE-NEXT: movapd %xmm14, 1344(%rcx) +; SSE-NEXT: movapd %xmm9, 1360(%rcx) +; SSE-NEXT: movapd %xmm13, 1344(%rcx) ; SSE-NEXT: movapd %xmm12, 1328(%rcx) -; SSE-NEXT: movapd %xmm13, 1312(%rcx) +; SSE-NEXT: movapd %xmm14, 1312(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1296(%rcx) ; SSE-NEXT: movapd %xmm15, 1280(%rcx) @@ -1898,7 +1897,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm1 @@ -1947,101 +1946,107 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdx), %ymm14 +; AVX1-ONLY-NEXT: vmovapd 160(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 192(%rdx), %ymm14 ; AVX1-ONLY-NEXT: vmovapd 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3] ; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 224(%rdx), %ymm13 ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] ; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 256(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 256(%rdx), %ymm11 ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 288(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 288(%rdx), %ymm9 ; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 320(%rdx), %ymm7 ; AVX1-ONLY-NEXT: vmovapd 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 352(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovapd 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovapd 352(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 384(%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm13[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm10[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm8[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = 
ymm4[0],ymm0[1],ymm4[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm4[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[1,0,2,2] @@ -2079,7 +2084,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] @@ -2091,30 +2096,25 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm15[0],ymm5[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2],ymm13[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm13[0],ymm3[1],ymm13[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = 
ymm12[0,1],mem[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2],ymm10[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 1504(%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 1504(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 1472(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1408(%rcx) @@ -2124,31 +2124,32 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovapd %ymm3, 1280(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1216(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 1184(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 1184(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1120(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 1088(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 1088(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 1024(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 992(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 992(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 928(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 896(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 896(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 832(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 800(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 800(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 736(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 704(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 704(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 640(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 608(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 608(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 512(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 512(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 
416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2163,7 +2164,7 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1440(%rcx) @@ -2407,91 +2408,91 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 384(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 416(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 
416(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 448(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm1[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 448(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = 
ymm12[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1504(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1472(%rcx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm13, 1504(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 1472(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1408(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 1376(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1376(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 1344(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 1312(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 1280(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1248(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1216(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1184(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1152(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1120(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1088(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1056(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1280(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1248(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1216(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1184(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1152(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1120(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1088(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1056(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1024(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll index 3c33ef722a2f7..6db64bdc2adac 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll @@ -199,7 +199,7 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm12 +; SSE-NEXT: movaps 16(%rsi), %xmm14 ; SSE-NEXT: movaps 32(%rsi), %xmm11 ; SSE-NEXT: movaps (%rdx), %xmm2 ; SSE-NEXT: movaps 16(%rdx), %xmm4 @@ -207,8 +207,8 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps 48(%rdx), %xmm9 ; SSE-NEXT: movaps (%rcx), %xmm8 ; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps 32(%rcx), %xmm14 -; SSE-NEXT: movaps 48(%rcx), %xmm15 +; SSE-NEXT: movaps 32(%rcx), %xmm15 +; SSE-NEXT: movaps 48(%rcx), %xmm12 ; SSE-NEXT: movaps %xmm2, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] ; SSE-NEXT: 
movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -220,29 +220,29 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] +; SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm11[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] ; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] -; SSE-NEXT: movaps 48(%rsi), %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] +; SSE-NEXT: movaps 48(%rsi), %xmm12 ; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; SSE-NEXT: movaps %xmm0, 224(%r8) ; SSE-NEXT: movaps %xmm9, 240(%r8) ; SSE-NEXT: movaps %xmm6, 192(%r8) ; SSE-NEXT: movaps %xmm11, 208(%r8) ; SSE-NEXT: movaps %xmm3, 160(%r8) ; SSE-NEXT: movaps %xmm7, 176(%r8) -; SSE-NEXT: movaps %xmm14, 128(%r8) -; SSE-NEXT: movaps %xmm12, 144(%r8) +; SSE-NEXT: movaps %xmm15, 128(%r8) +; SSE-NEXT: movaps %xmm14, 144(%r8) ; SSE-NEXT: movaps %xmm1, 96(%r8) ; SSE-NEXT: movaps %xmm4, 112(%r8) ; SSE-NEXT: movaps %xmm13, 64(%r8) @@ -437,118 +437,118 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i64_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps 48(%rdi), %xmm10 +; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm2 -; SSE-NEXT: movaps 32(%rsi), %xmm0 -; SSE-NEXT: movaps 48(%rsi), %xmm15 -; SSE-NEXT: movaps (%rdx), %xmm10 -; SSE-NEXT: movaps 16(%rdx), %xmm11 -; SSE-NEXT: movaps 32(%rdx), %xmm13 -; SSE-NEXT: movaps 48(%rdx), %xmm14 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm4 -; SSE-NEXT: movaps 32(%rcx), %xmm5 -; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: movaps 32(%rsi), %xmm1 +; SSE-NEXT: movaps 48(%rsi), %xmm0 +; SSE-NEXT: movaps (%rdx), %xmm11 +; SSE-NEXT: movaps 16(%rdx), %xmm12 +; SSE-NEXT: movaps 32(%rdx), %xmm14 +; SSE-NEXT: movaps 48(%rdx), %xmm15 +; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movaps 16(%rcx), %xmm5 +; SSE-NEXT: movaps 32(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: 
movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm15 -; SSE-NEXT: movaps 64(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] -; SSE-NEXT: movaps 64(%rdx), %xmm11 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps 80(%rdi), %xmm9 +; SSE-NEXT: movaps 64(%rdx), %xmm12 +; SSE-NEXT: movaps 64(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdi), %xmm11 ; SSE-NEXT: movaps 80(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movaps %xmm11, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdx), %xmm10 -; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 80(%rdx), %xmm7 +; SSE-NEXT: movaps 80(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movaps 96(%rdi), %xmm8 +; SSE-NEXT: movaps 96(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdx), %xmm5 ; SSE-NEXT: movaps 96(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm0 +; SSE-NEXT: movaps 112(%rdi), %xmm2 ; SSE-NEXT: movaps 112(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movaps 112(%rdx), %xmm3 -; SSE-NEXT: movaps 112(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps 112(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, 496(%r8) -; SSE-NEXT: movaps %xmm0, 480(%r8) -; SSE-NEXT: movaps %xmm1, 464(%r8) -; SSE-NEXT: movaps %xmm2, 448(%r8) +; SSE-NEXT: movaps %xmm2, 480(%r8) +; SSE-NEXT: movaps %xmm0, 464(%r8) +; SSE-NEXT: movaps %xmm4, 448(%r8) ; SSE-NEXT: movaps %xmm5, 432(%r8) -; 
SSE-NEXT: movaps %xmm7, 416(%r8) +; SSE-NEXT: movaps %xmm8, 416(%r8) ; SSE-NEXT: movaps %xmm6, 400(%r8) -; SSE-NEXT: movaps %xmm8, 384(%r8) -; SSE-NEXT: movaps %xmm10, 368(%r8) -; SSE-NEXT: movaps %xmm9, 352(%r8) -; SSE-NEXT: movaps %xmm12, 336(%r8) +; SSE-NEXT: movaps %xmm9, 384(%r8) +; SSE-NEXT: movaps %xmm7, 368(%r8) +; SSE-NEXT: movaps %xmm11, 352(%r8) +; SSE-NEXT: movaps %xmm10, 336(%r8) ; SSE-NEXT: movaps %xmm13, 320(%r8) -; SSE-NEXT: movaps %xmm11, 304(%r8) -; SSE-NEXT: movaps %xmm15, 288(%r8) -; SSE-NEXT: movaps %xmm14, 272(%r8) +; SSE-NEXT: movaps %xmm12, 304(%r8) +; SSE-NEXT: movaps %xmm14, 288(%r8) +; SSE-NEXT: movaps %xmm15, 272(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -641,35 +641,35 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm14[0],xmm12[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm13[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm15[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm15[0],xmm14[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm14[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm14[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 
= xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r8) @@ -678,10 +678,10 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm7, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 176(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm11, 304(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm10, 288(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm9, 272(%r8) @@ -710,86 +710,86 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride4_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm13 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm3, %ymm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm8, %ymm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm12, %ymm13 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm12, %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm6, %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm10, %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} 
ymm4 = ymm4[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm9[1],ymm15[1],ymm9[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm13[1],ymm5[3],ymm13[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 480(%r8) +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm3, 480(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 288(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -969,9 +969,9 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm2 @@ -1100,73 +1100,73 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdx), %xmm13 +; SSE-NEXT: movaps 176(%rdx), %xmm14 ; SSE-NEXT: movaps 176(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 192(%rdi), %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 192(%rdi), %xmm12 ; SSE-NEXT: movaps 192(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 192(%rdx), %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 192(%rdx), %xmm15 ; SSE-NEXT: movaps 192(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdi), %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm10 ; SSE-NEXT: movaps 208(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdx), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdx), %xmm7 ; SSE-NEXT: movaps 208(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movaps %xmm7, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 224(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: movaps 224(%rdx), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps 224(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdx), %xmm4 ; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps 240(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdi), %xmm2 ; SSE-NEXT: movaps 240(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: 
movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movaps 240(%rdx), %xmm3 -; SSE-NEXT: movaps 240(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps 240(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, 1008(%r8) -; SSE-NEXT: movaps %xmm0, 992(%r8) -; SSE-NEXT: movaps %xmm1, 976(%r8) -; SSE-NEXT: movaps %xmm2, 960(%r8) -; SSE-NEXT: movaps %xmm6, 944(%r8) -; SSE-NEXT: movaps %xmm5, 928(%r8) -; SSE-NEXT: movaps %xmm7, 912(%r8) -; SSE-NEXT: movaps %xmm8, 896(%r8) -; SSE-NEXT: movaps %xmm10, 880(%r8) -; SSE-NEXT: movaps %xmm9, 864(%r8) +; SSE-NEXT: movaps %xmm2, 992(%r8) +; SSE-NEXT: movaps %xmm0, 976(%r8) +; SSE-NEXT: movaps %xmm5, 960(%r8) +; SSE-NEXT: movaps %xmm4, 944(%r8) +; SSE-NEXT: movaps %xmm8, 928(%r8) +; SSE-NEXT: movaps %xmm6, 912(%r8) +; SSE-NEXT: movaps %xmm9, 896(%r8) +; SSE-NEXT: movaps %xmm7, 880(%r8) +; SSE-NEXT: movaps %xmm10, 864(%r8) ; SSE-NEXT: movaps %xmm11, 848(%r8) -; SSE-NEXT: movaps %xmm12, 832(%r8) -; SSE-NEXT: movaps %xmm14, 816(%r8) -; SSE-NEXT: movaps %xmm15, 800(%r8) +; SSE-NEXT: movaps %xmm13, 832(%r8) +; SSE-NEXT: movaps %xmm15, 816(%r8) +; SSE-NEXT: movaps %xmm12, 800(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%r8) -; SSE-NEXT: movaps %xmm13, 752(%r8) +; SSE-NEXT: movaps %xmm14, 752(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 736(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1416,10 +1416,10 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] @@ -1620,56 +1620,56 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3] +; 
AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm9[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm11[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm11[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %ymm13 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm9[1],ymm3[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 
+; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm14 @@ -1682,15 +1682,15 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovaps %ymm11, 992(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 864(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 864(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 832(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 736(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 704(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 608(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 576(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 736(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 704(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 608(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 576(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 352(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) @@ -2368,62 +2368,62 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 448(%rdi), %xmm13 +; SSE-NEXT: movaps 448(%rdi), %xmm12 ; SSE-NEXT: movaps 448(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 448(%rdx), %xmm12 -; SSE-NEXT: movaps 448(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 448(%rdx), %xmm11 +; SSE-NEXT: movaps 448(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} 
xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdi), %xmm13 ; SSE-NEXT: movaps 464(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movaps %xmm13, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 464(%rdx), %xmm10 -; SSE-NEXT: movaps 464(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdi), %xmm7 -; SSE-NEXT: movaps 480(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdx), %xmm7 +; SSE-NEXT: movaps 464(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movaps 480(%rdi), %xmm8 +; SSE-NEXT: movaps 480(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 480(%rdx), %xmm5 ; SSE-NEXT: movaps 480(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm0 +; SSE-NEXT: movaps 496(%rdi), %xmm2 ; SSE-NEXT: movaps 496(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movaps 496(%rdx), %xmm3 -; SSE-NEXT: movaps 496(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps 496(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, 2032(%r8) -; SSE-NEXT: movaps %xmm0, 2016(%r8) -; SSE-NEXT: movaps %xmm1, 2000(%r8) -; SSE-NEXT: movaps %xmm2, 1984(%r8) +; SSE-NEXT: movaps %xmm2, 2016(%r8) +; SSE-NEXT: movaps %xmm0, 2000(%r8) +; SSE-NEXT: movaps %xmm4, 1984(%r8) ; SSE-NEXT: movaps %xmm5, 1968(%r8) -; SSE-NEXT: movaps %xmm7, 1952(%r8) +; SSE-NEXT: movaps %xmm8, 1952(%r8) ; SSE-NEXT: movaps %xmm6, 1936(%r8) ; SSE-NEXT: movaps %xmm9, 1920(%r8) -; SSE-NEXT: movaps %xmm10, 1904(%r8) -; SSE-NEXT: movaps %xmm8, 1888(%r8) -; SSE-NEXT: movaps %xmm11, 1872(%r8) +; SSE-NEXT: movaps %xmm7, 1904(%r8) +; SSE-NEXT: movaps %xmm13, 1888(%r8) +; SSE-NEXT: movaps %xmm10, 1872(%r8) ; SSE-NEXT: movaps %xmm14, 1856(%r8) -; SSE-NEXT: movaps %xmm12, 1840(%r8) -; SSE-NEXT: movaps %xmm13, 1824(%r8) +; SSE-NEXT: movaps %xmm11, 1840(%r8) +; SSE-NEXT: movaps %xmm12, 1824(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1808(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2998,10 +2998,10 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 -; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] @@ -3458,78 +3458,78 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 352(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 352(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm14[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %ymm15 ; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm13[1],ymm3[3],ymm13[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovaps %ymm10, 2016(%r8) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm11, 2016(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1984(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1888(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1888(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1856(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 1760(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1728(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1632(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1600(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1504(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1472(%r8) -; 
AVX2-ONLY-NEXT: vmovaps %ymm8, 1376(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1344(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1760(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1728(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1632(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1600(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1504(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1472(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1376(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1344(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3640,583 +3640,591 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512F-LABEL: store_i64_stride4_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm11 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm10 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm8 -; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm16 -; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512F-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm7 +; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm23 +; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm3 +; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm12, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512F-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm26, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm31 +; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm28 ; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm31, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm31, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm31, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm28, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm28, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm28, %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,8,u,u,1,9,u,u> -; AVX512F-NEXT: vmovdqa64 
%zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,10,u,u,3,11,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,10,u,u,3,11,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <4,12,u,u,5,13,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <6,14,u,u,7,15,u,u> -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm19 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm27 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm9 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm10 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <6,14,u,u,7,15,u,u> +; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; 
AVX512F-NEXT: vmovdqa64 %zmm20, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm20 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm18 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm22, %zmm19 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm9 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte 
Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm0, 1984(%r8) +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, 1984(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm2, 1920(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm3, 1856(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 1792(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1728(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 1664(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm6, 1600(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm12, 1536(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 1472(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 1408(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, 1344(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1280(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 1216(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1152(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1088(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, 1024(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 960(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm20, 896(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 1856(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 1664(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 1600(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 1536(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm6, 1472(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 1408(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm12, 1344(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 1280(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1216(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 1152(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 1088(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 1024(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm19, 960(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 896(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm21, 832(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 768(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm9, 704(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, 768(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm18, 704(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm24, 640(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm30, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 512(%r8) +; AVX512F-NEXT: 
vmovdqa64 %zmm20, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm30, 320(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512F-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm31, 128(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r8) -; AVX512F-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512F-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512F-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride4_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm16 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,u,0,8,u,u,1,9> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,2,10,u,u,3,11> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm15 +; AVX512BW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,8,u,u,1,9> +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <u,u,2,10,u,u,3,11> +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,4,12,u,u,5,13> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <u,u,6,14,u,u,7,15> -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm11 -; AVX512BW-NEXT:
vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <u,u,6,14,u,u,7,15> +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT:
vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm31 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm28 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm31, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm31, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm28, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 -; AVX512BW-NEXT: 
vmovdqa64 (%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,8,u,u,1,9,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,10,u,u,3,11,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,10,u,u,3,11,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <4,12,u,u,5,13,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <6,14,u,u,7,15,u,u> -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm27 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm9 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm10 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <6,14,u,u,7,15,u,u> +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; 
AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm20 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm18 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm22, %zmm19 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm9 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; 
AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 1984(%r8) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1984(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 1920(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 1856(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1792(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1728(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1664(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1600(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 1536(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 1472(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 1408(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 1344(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1280(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 1216(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1152(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1024(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 960(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 896(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1856(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1664(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1600(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1536(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 1472(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 1408(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1344(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 1280(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1216(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 1152(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1088(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 960(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 832(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 768(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 704(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 768(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 704(%r8) 
; AVX512BW-NEXT: vmovdqa64 %zmm24, 640(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 512(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 128(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r8) -; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512BW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index 9f7c408e2a6bf..c0e1a17243a7e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -109,38 +109,38 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm2 ; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm3 -; SSE-NEXT: movaps (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm5 -; SSE-NEXT: movaps (%rcx), %xmm6 -; SSE-NEXT: movaps 16(%rcx), %xmm7 -; SSE-NEXT: movaps (%r8), %xmm8 -; SSE-NEXT: movaps 16(%r8), %xmm9 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] +; SSE-NEXT: movaps (%rsi), %xmm4 +; SSE-NEXT: movaps 16(%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: movaps 16(%rdx), %xmm1 +; SSE-NEXT: movaps (%rcx), %xmm7 +; SSE-NEXT: movaps 16(%rcx), %xmm8 +; SSE-NEXT: movaps (%r8), %xmm9 +; SSE-NEXT: movaps 16(%r8), %xmm3 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, (%r9) -; SSE-NEXT: 
movaps %xmm4, 16(%r9) -; SSE-NEXT: movaps %xmm8, 32(%r9) -; SSE-NEXT: movaps %xmm3, 48(%r9) -; SSE-NEXT: movaps %xmm6, 64(%r9) +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps %xmm2, (%r9) +; SSE-NEXT: movaps %xmm5, 16(%r9) +; SSE-NEXT: movaps %xmm9, 32(%r9) +; SSE-NEXT: movaps %xmm6, 48(%r9) +; SSE-NEXT: movaps %xmm7, 64(%r9) ; SSE-NEXT: movaps %xmm0, 80(%r9) -; SSE-NEXT: movaps %xmm5, 96(%r9) -; SSE-NEXT: movaps %xmm9, 112(%r9) +; SSE-NEXT: movaps %xmm1, 96(%r9) +; SSE-NEXT: movaps %xmm3, 112(%r9) ; SSE-NEXT: movaps %xmm10, 128(%r9) -; SSE-NEXT: movaps %xmm7, 144(%r9) +; SSE-NEXT: movaps %xmm8, 144(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf4: @@ -679,51 +679,51 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movapd 80(%rdi), %xmm14 -; SSE-NEXT: movapd 80(%rsi), %xmm12 +; SSE-NEXT: movapd 80(%rsi), %xmm13 ; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm13[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 80(%r8), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movapd 80(%rdx), %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] ; SSE-NEXT: movapd 80(%rcx), %xmm9 ; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movapd 96(%rdi), %xmm11 -; SSE-NEXT: movapd 96(%rsi), %xmm7 -; SSE-NEXT: movapd %xmm11, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm7[0] -; SSE-NEXT: movapd 96(%r8), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: movapd 96(%rsi), %xmm8 +; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] +; SSE-NEXT: movapd 96(%r8), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: movapd 96(%rdx), %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movapd 96(%rcx), %xmm4 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 112(%rdi), %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: movapd 96(%rcx), %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: movapd 112(%rdi), %xmm4 ; SSE-NEXT: movapd 112(%rsi), %xmm2 -; SSE-NEXT: movapd %xmm3, %xmm5 +; SSE-NEXT: movapd %xmm4, %xmm5 ; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movapd 112(%r8), %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] +; SSE-NEXT: movapd 112(%r8), %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] ; SSE-NEXT: movapd 112(%rdx), %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 112(%rcx), %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movapd %xmm0, 624(%r9) ; SSE-NEXT: movapd %xmm2, 608(%r9) -; SSE-NEXT: movapd %xmm3, 592(%r9) +; SSE-NEXT: movapd %xmm4, 592(%r9) ; SSE-NEXT: movapd %xmm1, 576(%r9) ; SSE-NEXT: movapd %xmm5, 560(%r9) -; SSE-NEXT: movapd %xmm4, 544(%r9) -; SSE-NEXT: movapd %xmm7, 528(%r9) +; 
SSE-NEXT: movapd %xmm3, 544(%r9) +; SSE-NEXT: movapd %xmm8, 528(%r9) ; SSE-NEXT: movapd %xmm11, 512(%r9) ; SSE-NEXT: movapd %xmm6, 496(%r9) -; SSE-NEXT: movapd %xmm13, 480(%r9) +; SSE-NEXT: movapd %xmm12, 480(%r9) ; SSE-NEXT: movapd %xmm9, 464(%r9) -; SSE-NEXT: movapd %xmm12, 448(%r9) +; SSE-NEXT: movapd %xmm13, 448(%r9) ; SSE-NEXT: movapd %xmm14, 432(%r9) ; SSE-NEXT: movapd %xmm10, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -783,153 +783,149 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $216, %rsp -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],mem[0],ymm4[2],mem[2] 
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],mem[0],ymm15[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm9[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm0[0],ymm14[1,2,3] +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0],ymm13[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm8[0],ymm0[1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm0[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 -; 
AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0],ymm2[1,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm9[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm8[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm8[2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm0 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm15, (%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 480(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 320(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 576(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 320(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 576(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 512(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 256(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 256(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 608(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 608(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 544(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -943,159 +939,158 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-LABEL: store_i64_stride5_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm7[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm9 ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm13 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],mem[0],ymm15[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: 
vbroadcastsd 120(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; 
AVX2-ONLY-NEXT: # ymm8 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm4[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm14[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 576(%r9) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm0 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm0[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm15 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm15[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; 
AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm3, 576(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 512(%r9) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 352(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 256(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 256(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 448(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 608(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 608(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%r9) ; AVX2-ONLY-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1524,51 +1519,51 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movapd 208(%rdi), %xmm14 -; SSE-NEXT: movapd 208(%rsi), %xmm12 +; SSE-NEXT: movapd 208(%rsi), %xmm13 ; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm13[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 208(%r8), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movapd 208(%rdx), %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] ; SSE-NEXT: movapd 208(%rcx), %xmm9 ; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 224(%rsi), %xmm7 -; SSE-NEXT: movapd %xmm11, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm7[0] -; SSE-NEXT: movapd 224(%r8), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: movapd 224(%rsi), %xmm8 +; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] +; SSE-NEXT: movapd 224(%r8), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: movapd 224(%rdx), %xmm6 -; SSE-NEXT: unpckhpd 
{{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movapd 224(%rcx), %xmm4 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 240(%rdi), %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: movapd 224(%rcx), %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: movapd 240(%rdi), %xmm4 ; SSE-NEXT: movapd 240(%rsi), %xmm2 -; SSE-NEXT: movapd %xmm3, %xmm5 +; SSE-NEXT: movapd %xmm4, %xmm5 ; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movapd 240(%r8), %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] +; SSE-NEXT: movapd 240(%r8), %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] ; SSE-NEXT: movapd 240(%rdx), %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 240(%rcx), %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movapd %xmm0, 1264(%r9) ; SSE-NEXT: movapd %xmm2, 1248(%r9) -; SSE-NEXT: movapd %xmm3, 1232(%r9) +; SSE-NEXT: movapd %xmm4, 1232(%r9) ; SSE-NEXT: movapd %xmm1, 1216(%r9) ; SSE-NEXT: movapd %xmm5, 1200(%r9) -; SSE-NEXT: movapd %xmm4, 1184(%r9) -; SSE-NEXT: movapd %xmm7, 1168(%r9) +; SSE-NEXT: movapd %xmm3, 1184(%r9) +; SSE-NEXT: movapd %xmm8, 1168(%r9) ; SSE-NEXT: movapd %xmm11, 1152(%r9) ; SSE-NEXT: movapd %xmm6, 1136(%r9) -; SSE-NEXT: movapd %xmm13, 1120(%r9) +; SSE-NEXT: movapd %xmm12, 1120(%r9) ; SSE-NEXT: movapd %xmm9, 1104(%r9) -; SSE-NEXT: movapd %xmm12, 1088(%r9) +; SSE-NEXT: movapd %xmm13, 1088(%r9) ; SSE-NEXT: movapd %xmm14, 1072(%r9) ; SSE-NEXT: movapd %xmm10, 1056(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1707,17 +1702,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX1-ONLY-NEXT: subq $1048, %rsp # imm = 0x418 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] @@ -1726,26 +1721,25 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] -; 
AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -1758,49 +1752,49 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1811,205 +1805,205 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm14[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm13[0],ymm9[0],ymm13[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm14[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm13[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm9 -; 
AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm15[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm9[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm2[0],ymm8[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm15[1],xmm8[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm15[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm15[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm8[6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, (%rsp), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm9[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $252, (%rsp), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm8[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm13[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm13[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm13[2],ymm2[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm8[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm2[6,7] 
-; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm2[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0],ymm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0],ymm12[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm14[2],ymm11[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm5[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0],ymm8[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0],ymm11[1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0],ymm11[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 976(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 960(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1136(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 976(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 
960(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1136(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm9, 1120(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 816(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 800(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 816(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 800(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 336(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm14, 320(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 656(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 656(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 640(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2030,9 +2024,9 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r9) @@ -2074,69 +2068,69 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: addq $1048, %rsp # imm = 0x418 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $1128, %rsp # imm = 0x468 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, 
%ymm0, %ymm6 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: 
vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 @@ -2145,11 +2139,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 @@ -2209,188 +2203,186 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 
128(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm5[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm5[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm5 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm5[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm5[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = 
ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm2[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm7[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm10 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm10[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm4 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded 
Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm4[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1184(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 1152(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1120(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 1056(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1056(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 1024(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 992(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 992(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 896(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 864(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 832(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 896(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 864(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 832(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 736(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 736(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 704(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 672(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 576(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 576(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2407,7 +2399,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r9) @@ -2421,14 +2413,14 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 1088(%r9) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1088(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1248(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1248(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2442,201 +2434,201 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-LABEL: store_i64_stride5_vf32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm22 ; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm25 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [13,5,13,5,13,5,13,5] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 -; 
AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm26, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm26, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm28, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm19, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm17 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm23 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm25 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm13, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm13, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm16 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm20, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512F-NEXT: vmovdqa64 %zmm27, 
%zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm28, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; 
AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm15 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm31, %zmm1, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm31, %zmm29 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm31, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm31 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} ; AVX512F-NEXT: movb $-116, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm21 {%k3} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k3} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm20 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 ; AVX512F-NEXT: movb $8, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [12,1,2,3,4,13,6,7] -; AVX512F-NEXT: vpermt2q %zmm1, 
%zmm6, %zmm17 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 {%k3} +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm28 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k3} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 {%k2} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm26 -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm11, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1152(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 640(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 1216(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2644,201 +2636,201 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-LABEL: store_i64_stride5_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm22 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12 -; 
AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 +; 
AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm31, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm31 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm21 {%k3} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k3} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 ; AVX512BW-NEXT: movb $8, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [12,1,2,3,4,13,6,7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm7 +; AVX512BW-NEXT: 
vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm28 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1152(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 640(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 1088(%r9) 
+; AVX512BW-NEXT: vmovdqa64 %zmm23, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1216(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3325,51 +3317,51 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movapd 464(%rdi), %xmm14 -; SSE-NEXT: movapd 464(%rsi), %xmm12 +; SSE-NEXT: movapd 464(%rsi), %xmm13 ; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm13[0] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 464(%r8), %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movapd 464(%rdx), %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] ; SSE-NEXT: movapd 464(%rcx), %xmm9 ; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movapd 480(%rdi), %xmm11 -; SSE-NEXT: movapd 480(%rsi), %xmm7 -; SSE-NEXT: movapd %xmm11, %xmm13 -; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm7[0] -; SSE-NEXT: movapd 480(%r8), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: movapd 480(%rsi), %xmm8 +; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm8[0] +; SSE-NEXT: movapd 480(%r8), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: movapd 480(%rdx), %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movapd 480(%rcx), %xmm4 -; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 496(%rdi), %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: movapd 480(%rcx), %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: movapd 496(%rdi), %xmm4 ; SSE-NEXT: movapd 496(%rsi), %xmm2 -; SSE-NEXT: movapd %xmm3, %xmm5 +; SSE-NEXT: movapd %xmm4, %xmm5 ; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: movapd 496(%r8), %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] +; SSE-NEXT: movapd 496(%r8), %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] ; SSE-NEXT: movapd 496(%rdx), %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 496(%rcx), %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movapd %xmm0, 2544(%r9) ; SSE-NEXT: movapd %xmm2, 2528(%r9) -; SSE-NEXT: movapd %xmm3, 2512(%r9) +; SSE-NEXT: movapd %xmm4, 2512(%r9) ; SSE-NEXT: movapd %xmm1, 2496(%r9) ; SSE-NEXT: movapd %xmm5, 2480(%r9) -; SSE-NEXT: movapd %xmm4, 2464(%r9) -; SSE-NEXT: movapd %xmm7, 2448(%r9) +; SSE-NEXT: movapd %xmm3, 2464(%r9) +; SSE-NEXT: movapd %xmm8, 2448(%r9) ; SSE-NEXT: movapd %xmm11, 2432(%r9) ; SSE-NEXT: movapd %xmm6, 2416(%r9) -; SSE-NEXT: movapd %xmm13, 2400(%r9) +; SSE-NEXT: movapd %xmm12, 2400(%r9) ; SSE-NEXT: movapd %xmm9, 2384(%r9) -; SSE-NEXT: movapd %xmm12, 2368(%r9) +; SSE-NEXT: movapd %xmm13, 2368(%r9) ; SSE-NEXT: movapd %xmm14, 2352(%r9) ; SSE-NEXT: movapd %xmm10, 2336(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3668,17 +3660,17 @@ 
define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2280, %rsp # imm = 0x8E8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX1-ONLY-NEXT: subq $2264, %rsp # imm = 0x8D8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] @@ -3687,11 +3679,10 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3701,16 +3692,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] +; 
AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -3730,18 +3721,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 288(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3757,7 +3746,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3780,50 +3769,50 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm13 ; AVX1-ONLY-NEXT: vmovapd 480(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3842,52 +3831,52 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vbroadcastsd 264(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 328(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3906,106 +3895,107 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 392(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: 
vmovaps (%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm15[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd %ymm13, %ymm9 +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # 
ymm13 = ymm12[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm8[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm12[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: 
vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm9[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0],ymm8[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm8[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] @@ -4031,12 +4021,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm7[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm7[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm7[3] @@ -4057,9 +4047,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm6 @@ -4071,13 +4061,13 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm4 ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] @@ -4090,8 +4080,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovapd 304(%rsi), %xmm5 @@ -4159,9 +4148,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = 
ymm2[0],ymm0[0],ymm2[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 432(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm3 @@ -4173,24 +4162,25 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm3[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 448(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 496(%rsi), %xmm2 @@ -4201,16 +4191,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0],ymm10[1,2,3] +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0],mem[1,2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] ; 
AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] @@ -4227,105 +4218,104 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 1936(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1920(%r9) 
+; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 1936(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 1920(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 2256(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 2240(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 2240(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 2416(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 2400(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 2400(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 2096(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 2080(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 2080(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1616(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1600(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 1600(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 1776(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 1760(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 1760(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1456(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 1440(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 1440(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 976(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 960(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 1136(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 1120(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 960(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 1136(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 1120(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) @@ -4469,13 +4459,13 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $2280, %rsp # imm = 0x8E8 +; AVX1-ONLY-NEXT: addq $2264, %rsp # imm = 0x8D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2760, %rsp # imm = 0xAC8 +; AVX2-ONLY-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 @@ -4773,7 +4763,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 184(%rsi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm1 @@ -4811,71 +4801,71 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 256(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm13[1],ymm6[1],ymm13[3],ymm6[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 280(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 280(%rsi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm8[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 
320(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 344(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 344(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 408(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 408(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm15[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%rsi), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = 
ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4883,18 +4873,18 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 @@ -4904,11 +4894,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1 @@ -4920,12 +4910,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 @@ -4956,7 +4946,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 @@ -4975,130 +4965,130 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm2 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 
$49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm0[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm15 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm15[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm11 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm11[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm11[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm11[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm11[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: 
vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm11 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm11[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm11[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm12 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm12[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm11[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm0 ; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 2464(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 2432(%r9) @@ -5115,13 +5105,12 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2080(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 2016(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1984(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 1952(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1984(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 1952(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 1856(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1824(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1856(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 1824(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1792(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5182,7 +5171,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r9) @@ -5190,7 +5179,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r9) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps 
%ymm0, 384(%r9) @@ -5214,7 +5203,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 2368(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2368(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2048(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5229,7 +5219,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 2528(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 2528(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5244,447 +5234,444 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-ONLY-NEXT: addq $2760, %rsp # imm = 0xAC8 +; AVX2-ONLY-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: store_i64_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3080, %rsp # imm = 0xC08 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512F-NEXT: subq $3144, %rsp # imm = 0xC48 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm6 +; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; 
AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm25 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm26 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm19 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 +; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm18 +; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm29 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 -; 
AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm9, %zmm28 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm26 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm29 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm19, %zmm12 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm17 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm21 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm7 ; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm6, 
%zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm11 ; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm11 -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm11 +; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm0, %zmm9 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} ; AVX512F-NEXT: movb $-116, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm19 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm27 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm15 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: movb $8, %al -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} -; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,1,2,3,4,13,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm28 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm22 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm21 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: movb $8, %al +; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k3} +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm28 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 {%k3} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: 
vmovdqa64 256(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k3} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm27 +; AVX512F-NEXT: vpermt2q 
%zmm2, %zmm15, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm11 -; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 +; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm3, 2496(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 2432(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 2368(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 {%k2} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k3} +; AVX512F-NEXT: vmovdqa64 %zmm5, 2496(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 2432(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 2368(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm4, 2304(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 2240(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 2240(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm11, 2176(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, 2112(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, 2112(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm13, 2048(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 1984(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1920(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 1856(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 1792(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, 1728(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 1664(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 
1600(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, 1536(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1472(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 1408(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1344(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1280(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 1984(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1920(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1856(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 1792(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 1728(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm27, 1664(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 1536(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 1472(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 1408(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1344(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm3, 1280(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 1216(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 1088(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 1024(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 896(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 832(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm30, 768(%r9) @@ -5692,7 +5679,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 704(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm12, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 512(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5701,7 +5688,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 256(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload @@ -5710,447 +5697,444 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-NEXT: addq $3080, %rsp # imm = 0xC08 +; AVX512F-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3080, %rsp # imm = 0xC08 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512BW-NEXT: 
subq $3144, %rsp # imm = 0xC48 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm0 +; 
AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm8 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: 
vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm6 +; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; 
AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm26 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm19 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; 
AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm19 +; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm18 +; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm29 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm28 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm29 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm12 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 
%zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm21 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm7 ; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm11 ; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm11 -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqa64 
%zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm5 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} ; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm19 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: movb $8, %al -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,1,2,3,4,13,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm20 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 {%k3} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: movb $8, %al +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 
{%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, 
%zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm11 -; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm17 
+; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm11 +; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 2496(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 2432(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 2368(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm5, 2496(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 2432(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 2368(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 2304(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 2240(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 2240(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 2176(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 2112(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 2112(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 2048(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1984(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1920(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1856(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1792(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1728(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1664(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 1600(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1536(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1472(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1408(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1344(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1280(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1152(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1984(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1920(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1856(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 1792(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1728(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 1664(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 1536(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 1472(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1408(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1344(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1280(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1088(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 1024(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 960(%r9) -; 
AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 832(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm30, 768(%r9) @@ -6158,7 +6142,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 704(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 512(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6167,7 +6151,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 256(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload @@ -6176,7 +6160,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-NEXT: addq $3080, %rsp # imm = 0xC08 +; AVX512BW-NEXT: addq $3144, %rsp # imm = 0xC48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index 71ebeb1d24afa..f5cd64c698d5b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -310,98 +310,99 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps (%rsi), %xmm9 -; SSE-NEXT: movaps 16(%rsi), %xmm13 -; SSE-NEXT: movaps 32(%rsi), %xmm12 -; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps 32(%rdx), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps 16(%rsi), %xmm12 +; SSE-NEXT: movaps 32(%rsi), %xmm14 +; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps 16(%rdx), %xmm6 +; SSE-NEXT: movaps 32(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm10 -; SSE-NEXT: movaps 16(%rcx), %xmm14 -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm8 +; SSE-NEXT: movaps 16(%rcx), %xmm13 +; SSE-NEXT: movaps (%r8), %xmm7 +; SSE-NEXT: movaps 16(%r8), %xmm9 ; SSE-NEXT: movaps (%r9), %xmm11 -; SSE-NEXT: movaps 16(%r9), %xmm15 -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; SSE-NEXT: movaps 16(%r9), %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm15[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r8), %xmm12 +; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm13[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r8), %xmm5 ; SSE-NEXT: movaps 32(%r9), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 48(%rdi), %xmm5 -; SSE-NEXT: movaps 48(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; 
SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: movaps 48(%rdx), %xmm2 -; SSE-NEXT: movaps 48(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: movaps 48(%r8), %xmm3 -; SSE-NEXT: movaps 48(%r9), %xmm4 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps 48(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps 48(%rdx), %xmm1 +; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movaps 48(%r9), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 368(%rax) -; SSE-NEXT: movaps %xmm2, 352(%rax) -; SSE-NEXT: movaps %xmm5, 336(%rax) -; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps %xmm1, 304(%rax) -; SSE-NEXT: movaps %xmm6, 288(%rax) -; SSE-NEXT: movaps %xmm12, 272(%rax) +; SSE-NEXT: movaps %xmm0, 368(%rax) +; SSE-NEXT: movaps %xmm1, 352(%rax) +; SSE-NEXT: movaps %xmm6, 336(%rax) +; SSE-NEXT: movaps %xmm2, 320(%rax) +; SSE-NEXT: movaps %xmm4, 304(%rax) +; SSE-NEXT: movaps %xmm7, 288(%rax) +; SSE-NEXT: movaps %xmm5, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rax) -; SSE-NEXT: movaps %xmm7, 224(%rax) -; SSE-NEXT: movaps %xmm8, 208(%rax) +; SSE-NEXT: movaps %xmm8, 224(%rax) +; SSE-NEXT: movaps %xmm14, 208(%rax) ; SSE-NEXT: movaps %xmm15, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps %xmm9, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps %xmm14, 128(%rax) -; SSE-NEXT: movaps %xmm13, 112(%rax) +; SSE-NEXT: movaps %xmm13, 128(%rax) +; SSE-NEXT: movaps %xmm12, 112(%rax) ; SSE-NEXT: movaps %xmm11, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) @@ -410,7 +411,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps %xmm9, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $24, %rsp @@ -418,9 +420,9 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm12 +; 
AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -433,38 +435,38 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm7 ; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm7, %ymm10 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm14[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm12[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX1-ONLY-NEXT: 
vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm11[0],ymm13[0],ymm11[2],ymm13[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm11[0],ymm14[0],ymm11[2],ymm14[3] ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm15 @@ -485,8 +487,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm14[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm12[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0] ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] @@ -495,13 +497,13 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 320(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm11, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) @@ -513,82 +515,82 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-LABEL: store_i64_stride6_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm4[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm3[0,0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm9[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[0,1],ymm2[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 +; 
AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm12[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm11[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm11[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm7[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm3[0,1],ymm13[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm7[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm14[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdx), 
%ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm10 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm10[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm8[1],ymm13[1],ymm8[3],ymm13[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm13[0],ymm8[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vzeroupper @@ -791,47 +793,47 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps (%rdi), %xmm7 ; SSE-NEXT: movaps 16(%rdi), %xmm8 ; SSE-NEXT: movaps 32(%rdi), %xmm9 -; SSE-NEXT: movaps (%rsi), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm3 ; SSE-NEXT: movaps 16(%rsi), %xmm1 ; SSE-NEXT: movaps 32(%rsi), %xmm0 ; SSE-NEXT: movaps (%rdx), %xmm10 ; 
SSE-NEXT: movaps 16(%rdx), %xmm11 ; SSE-NEXT: movaps 32(%rdx), %xmm12 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps 16(%rcx), %xmm2 ; SSE-NEXT: movaps (%r8), %xmm13 ; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps 16(%r9), %xmm6 +; SSE-NEXT: movaps (%r9), %xmm6 +; SSE-NEXT: movaps 16(%r9), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -899,63 +901,63 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 80(%rdx), %xmm11 +; SSE-NEXT: movaps 80(%rdx), %xmm12 ; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = 
xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 80(%r8), %xmm13 -; SSE-NEXT: movaps 80(%r9), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdx), %xmm9 -; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps 80(%r8), %xmm14 +; SSE-NEXT: movaps 80(%r9), %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: movaps 96(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 96(%r8), %xmm7 -; SSE-NEXT: movaps 96(%r9), %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 112(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: movaps 112(%rdx), %xmm4 -; SSE-NEXT: movaps 112(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movaps 112(%r8), %xmm2 -; SSE-NEXT: movaps 112(%r9), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: movaps 96(%rdx), %xmm10 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 96(%r8), %xmm5 +; SSE-NEXT: movaps 96(%r9), %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps 112(%rdx), %xmm1 +; SSE-NEXT: movaps 112(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 112(%r8), %xmm0 +; SSE-NEXT: movaps 112(%r9), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 752(%rax) -; SSE-NEXT: movaps %xmm4, 736(%rax) -; SSE-NEXT: movaps %xmm1, 720(%rax) -; SSE-NEXT: movaps %xmm0, 704(%rax) -; SSE-NEXT: movaps %xmm3, 688(%rax) -; SSE-NEXT: movaps %xmm6, 672(%rax) -; SSE-NEXT: movaps %xmm7, 656(%rax) -; SSE-NEXT: movaps %xmm9, 640(%rax) -; 
SSE-NEXT: movaps %xmm12, 624(%rax) +; SSE-NEXT: movaps %xmm0, 752(%rax) +; SSE-NEXT: movaps %xmm1, 736(%rax) +; SSE-NEXT: movaps %xmm6, 720(%rax) +; SSE-NEXT: movaps %xmm2, 704(%rax) +; SSE-NEXT: movaps %xmm4, 688(%rax) +; SSE-NEXT: movaps %xmm7, 672(%rax) +; SSE-NEXT: movaps %xmm5, 656(%rax) +; SSE-NEXT: movaps %xmm10, 640(%rax) +; SSE-NEXT: movaps %xmm9, 624(%rax) ; SSE-NEXT: movaps %xmm8, 608(%rax) -; SSE-NEXT: movaps %xmm10, 592(%rax) -; SSE-NEXT: movaps %xmm14, 576(%rax) -; SSE-NEXT: movaps %xmm13, 560(%rax) -; SSE-NEXT: movaps %xmm11, 544(%rax) +; SSE-NEXT: movaps %xmm11, 592(%rax) +; SSE-NEXT: movaps %xmm13, 576(%rax) +; SSE-NEXT: movaps %xmm14, 560(%rax) +; SSE-NEXT: movaps %xmm12, 544(%rax) ; SSE-NEXT: movaps %xmm15, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 512(%rax) @@ -1028,8 +1030,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm13 +; AVX1-ONLY-NEXT: subq $440, %rsp # imm = 0x1B8 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 @@ -1039,10 +1041,10 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 @@ -1065,153 +1067,153 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm6[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[2],ymm11[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3],ymm11[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm11[0],ymm1[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm15[0],mem[0] +; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 592(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 384(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 704(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 512(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 592(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 704(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 512(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 736(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 672(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -1219,13 +1221,13 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 608(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1238,7 +1240,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: addq $440, %rsp # imm = 0x1B8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1257,24 +1259,24 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm15[1],xmm3[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[0,1],ymm3[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm13 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1286,69 +1288,69 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = xmm9[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm4[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm2[1],xmm8[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[0,1],ymm9[0,1] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[0,1],ymm10[0,1] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = xmm0[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm10[1] ; AVX2-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} 
ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm15, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] @@ -1357,8 +1359,8 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm6 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] @@ -1366,57 +1368,57 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm6 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm10 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm12 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%rax) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm11, 736(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 704(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 672(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 544(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 512(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 672(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 544(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 512(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 480(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 320(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) @@ -2593,8 +2595,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 16(%rcx), %xmm3 ; SSE-NEXT: movaps (%r8), %xmm13 ; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps 16(%r9), %xmm6 +; SSE-NEXT: movaps (%r9), %xmm6 +; SSE-NEXT: movaps 16(%r9), %xmm5 ; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2606,9 +2608,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] @@ -2621,9 +2623,9 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] ; SSE-NEXT: 
movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -2859,63 +2861,63 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 208(%rdx), %xmm13 -; SSE-NEXT: movaps 208(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps 208(%r8), %xmm10 -; SSE-NEXT: movaps 208(%r9), %xmm2 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: movaps 224(%rdi), %xmm12 +; SSE-NEXT: movaps 208(%rdx), %xmm12 +; SSE-NEXT: movaps 208(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 208(%r8), %xmm11 +; SSE-NEXT: movaps 208(%r9), %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm13 ; SSE-NEXT: movaps 224(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movaps %xmm13, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 224(%rdx), %xmm9 ; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 224(%r8), %xmm6 -; SSE-NEXT: movaps 224(%r9), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps 224(%r8), %xmm5 +; SSE-NEXT: movaps 224(%r9), %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdi), %xmm6 +; SSE-NEXT: movaps 240(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps 240(%rsi), %xmm4 -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: movaps 240(%rdx), %xmm4 -; SSE-NEXT: movaps 240(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] 
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movaps 240(%r8), %xmm2 -; SSE-NEXT: movaps 240(%r9), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: movaps 240(%rdx), %xmm1 +; SSE-NEXT: movaps 240(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 240(%r8), %xmm0 +; SSE-NEXT: movaps 240(%r9), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 1520(%rax) -; SSE-NEXT: movaps %xmm4, 1504(%rax) -; SSE-NEXT: movaps %xmm1, 1488(%rax) -; SSE-NEXT: movaps %xmm0, 1472(%rax) -; SSE-NEXT: movaps %xmm3, 1456(%rax) +; SSE-NEXT: movaps %xmm0, 1520(%rax) +; SSE-NEXT: movaps %xmm1, 1504(%rax) +; SSE-NEXT: movaps %xmm6, 1488(%rax) +; SSE-NEXT: movaps %xmm2, 1472(%rax) +; SSE-NEXT: movaps %xmm4, 1456(%rax) ; SSE-NEXT: movaps %xmm7, 1440(%rax) -; SSE-NEXT: movaps %xmm6, 1424(%rax) +; SSE-NEXT: movaps %xmm5, 1424(%rax) ; SSE-NEXT: movaps %xmm9, 1408(%rax) -; SSE-NEXT: movaps %xmm12, 1392(%rax) +; SSE-NEXT: movaps %xmm13, 1392(%rax) ; SSE-NEXT: movaps %xmm8, 1376(%rax) -; SSE-NEXT: movaps %xmm11, 1360(%rax) +; SSE-NEXT: movaps %xmm10, 1360(%rax) ; SSE-NEXT: movaps %xmm14, 1344(%rax) -; SSE-NEXT: movaps %xmm10, 1328(%rax) -; SSE-NEXT: movaps %xmm13, 1312(%rax) +; SSE-NEXT: movaps %xmm11, 1328(%rax) +; SSE-NEXT: movaps %xmm12, 1312(%rax) ; SSE-NEXT: movaps %xmm15, 1296(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1280(%rax) @@ -3084,340 +3086,342 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1592, %rsp # imm = 0x638 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: subq $1608, %rsp # imm = 0x648 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = 
xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm13[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, 
%ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm4[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm7 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 
{{.*#+}} ymm6 = ymm0[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm12[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[0],ymm5[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[0],ymm5[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm13[0],ymm12[0],ymm13[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],ymm7[2,3] +; 
AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm4[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3] -; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] +; 
AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm14[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm10[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = xmm0[0],mem[0] @@ -3466,8 +3470,8 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm11, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm12, 400(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 784(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 784(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 768(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3548,7 +3552,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $1592, %rsp # imm = 0x638 +; AVX1-ONLY-NEXT: addq $1608, %rsp # imm = 0x648 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3812,7 +3816,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 @@ -3829,106 +3833,106 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = 
ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 144(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%r9), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%r9), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] -; 
AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm15[0],ymm2[2],ymm15[2] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1472(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1440(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1472(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1440(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1344(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1312(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1280(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 1248(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1280(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1248(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 1120(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1088(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1056(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1120(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1088(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1056(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 928(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 896(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 864(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 928(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 896(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 864(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 736(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 736(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 544(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 544(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3942,7 +3946,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3988,281 +3992,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), 
%zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; 
AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $12, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $48, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: 
vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $16, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 
128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -4270,281 +4271,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, 
%zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = 
[7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $12, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $48, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
%zmm10, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 192(%r8), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb 
$16, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -4552,281 +4550,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = 
[4,12,5,13,4,12,5,13] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 
-; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $12, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQ-SLOW-NEXT: movb $48, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 
{{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $16, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 
64(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = 
[0,10,2,3,4,5,6,11] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512DQ-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq @@ -4834,281 +4829,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-LABEL: store_i64_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $12, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQ-FAST-NEXT: movb $48, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $16, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa 
128(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -5116,281 
+5108,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512BW-ONLY-SLOW-NEXT: movb $12, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $16, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} 
zmm30 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} 
zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 
1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq @@ -5398,281 +5387,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 
= [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: 
vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q 
%zmm22, %zmm6, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $12, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $48, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 
%xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: 
vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $16, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, 
%zmm17, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q 
%zmm21, %zmm17, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512BW-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq @@ -5680,281 +5666,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf32: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 
%zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 
-; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, 
%zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $12, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $48, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; 
AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq 
{{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $16, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; 
AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[0,1,2,3,4,12,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512DQBW-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQBW-SLOW-NEXT: 
vzeroupper ; AVX512DQBW-SLOW-NEXT: retq @@ -5962,281 +5945,278 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf32: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm29 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm25 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm29, %zmm8, %zmm20 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm28, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm12, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, 
%zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm23 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm30 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $12, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} ; AVX512DQBW-FAST-NEXT: movb $48, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, 
%zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm16 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm8, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $16, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm29, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq 
{{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm30 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm30 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm17, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm22, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm17, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm7 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 
384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512DQBW-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq @@ -6870,70 +6850,70 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm14 +; SSE-NEXT: movaps 464(%rdi), %xmm15 ; SSE-NEXT: movaps 464(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 464(%rdx), %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 464(%rdx), %xmm14 ; SSE-NEXT: movaps 464(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 464(%r8), %xmm11 ; SSE-NEXT: movaps 464(%r9), %xmm0 ; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdi), %xmm9 +; SSE-NEXT: movaps 480(%rdi), %xmm12 ; SSE-NEXT: movaps 480(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdx), %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 480(%rdx), %xmm8 ; SSE-NEXT: movaps 480(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 480(%r8), %xmm6 -; SSE-NEXT: movaps 480(%r9), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 480(%r8), %xmm5 +; SSE-NEXT: movaps 480(%r9), %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdi), %xmm6 +; SSE-NEXT: movaps 496(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps 496(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 496(%rdx), %xmm3 -; SSE-NEXT: movaps 496(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps 496(%r8), %xmm2 -; SSE-NEXT: movaps 496(%r9), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: 
movaps 496(%rdx), %xmm1 +; SSE-NEXT: movaps 496(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 496(%r8), %xmm0 +; SSE-NEXT: movaps 496(%r9), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 3056(%rax) -; SSE-NEXT: movaps %xmm3, 3040(%rax) -; SSE-NEXT: movaps %xmm1, 3024(%rax) -; SSE-NEXT: movaps %xmm0, 3008(%rax) +; SSE-NEXT: movaps %xmm0, 3056(%rax) +; SSE-NEXT: movaps %xmm1, 3040(%rax) +; SSE-NEXT: movaps %xmm6, 3024(%rax) +; SSE-NEXT: movaps %xmm2, 3008(%rax) ; SSE-NEXT: movaps %xmm4, 2992(%rax) ; SSE-NEXT: movaps %xmm7, 2976(%rax) -; SSE-NEXT: movaps %xmm6, 2960(%rax) -; SSE-NEXT: movaps %xmm10, 2944(%rax) -; SSE-NEXT: movaps %xmm9, 2928(%rax) -; SSE-NEXT: movaps %xmm8, 2912(%rax) -; SSE-NEXT: movaps %xmm12, 2896(%rax) +; SSE-NEXT: movaps %xmm5, 2960(%rax) +; SSE-NEXT: movaps %xmm8, 2944(%rax) +; SSE-NEXT: movaps %xmm12, 2928(%rax) +; SSE-NEXT: movaps %xmm9, 2912(%rax) +; SSE-NEXT: movaps %xmm10, 2896(%rax) ; SSE-NEXT: movaps %xmm13, 2880(%rax) ; SSE-NEXT: movaps %xmm11, 2864(%rax) -; SSE-NEXT: movaps %xmm15, 2848(%rax) -; SSE-NEXT: movaps %xmm14, 2832(%rax) +; SSE-NEXT: movaps %xmm14, 2848(%rax) +; SSE-NEXT: movaps %xmm15, 2832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 2816(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7294,644 +7274,644 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-LABEL: store_i64_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $3464, %rsp # imm = 0xD88 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm6 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 +; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm1 +; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm0 -; 
AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, 352(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: 
vinsertf128 $1, 448(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm2[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm7 -; AVX1-ONLY-NEXT: 
vperm2f128 {{.*#+}} ymm8 = ymm7[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; 
AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 352(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 352(%r9), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%r9), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm12 ; AVX1-ONLY-NEXT: 
vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%r9), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%r9), %ymm13 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm9[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovapd 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; 
AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 272(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: 
vbroadcastsd 24(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; 
AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 272(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 368(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm1 
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 400(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] @@ -7942,7 +7922,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7954,7 +7934,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7966,7 +7946,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 504(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -8572,10 +8552,10 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -8630,8 +8610,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm3 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} 
xmm1 = xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8708,9 +8688,9 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm0 @@ -8775,8 +8755,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 @@ -8958,35 +8937,36 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 432(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 440(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 464(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 472(%r8), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm4 @@ -9007,15 +8987,13 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9031,7 +9009,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3040(%rax) @@ -9041,11 +9019,12 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2880(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 2848(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 2816(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 2784(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 2784(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2688(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 2656(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 2624(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 2656(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2624(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2592(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9106,8 +9085,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 928(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9214,1347 +9192,5421 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i64_stride6_vf64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3272, %rsp # imm = 0xCC8 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm4 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm28 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm24 -; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm23 -; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm19 -; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm10 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] -; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, 
%zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm28, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,9,2,10,1,9,2,10] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,13,6,14,5,13,6,14] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: 
vpermt2q %zmm25, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm12, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm10, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm10, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm10, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,5,13,4,12,5,13] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm27 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 
256(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512F-NEXT: vpermi2q %zmm23, %zmm24, %zmm19 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm25, %zmm13 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm10 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,1,9,0,8,1,9] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm25, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm25, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 -; 
AVX512F-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm14, %zmm25 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm14, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 -; AVX512F-NEXT: movb $12, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} -; AVX512F-NEXT: movb $48, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; 
AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,9,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm5 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm6 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm9 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm12 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm13 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm17 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm21 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm21 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm8 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm16 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm16 = zmm15[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: movb $16, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm5 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm24 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm23 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm23 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm23 -; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm22 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm18 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm18 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 
%zmm9, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm18 -; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm16 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm8 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm8 -; AVX512F-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm7 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm9 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm9 -; AVX512F-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm10 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm5 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm27 {%k2} -; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%r9), %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%r9), %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r9), %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%r9), %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa 256(%rdx), %xmm12 -; AVX512F-NEXT: 
vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 320(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 384(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 448(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm17, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm20, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm30, %zmm31 -; AVX512F-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm30, %zmm20 -; AVX512F-NEXT: vinserti32x4 $2, 192(%r8), %zmm11, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm30, %zmm17 -; AVX512F-NEXT: vinserti32x4 $2, 256(%r8), %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm30, %zmm12 -; AVX512F-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm30, %zmm11 -; AVX512F-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm30, %zmm6 -; AVX512F-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm30, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm30, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q 
%zmm15, %zmm30, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm30, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm30, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm30, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm30, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm5 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm5, 3008(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 2880(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 2816(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2752(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 2624(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2560(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 2496(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 2432(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 2240(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 2112(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 2048(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 1856(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1728(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1664(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 1280(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 1088(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 
640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 2688(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 2304(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 1920(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 1536(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm31, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512F-NEXT: addq $3272, %rsp # imm = 0xCC8 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf64: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q 
%zmm19, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: movb $12, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: movb $48, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: movb $16, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = 
[0,14,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; 
AVX512F-ONLY-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq +; +; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf64: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: movb $12, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: movb $48, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%zmm14, %zmm11 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm12 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $16, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte 
Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf64: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: 
# ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm23 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, 
%zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: movb $12, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1}
+; AVX512DQ-SLOW-NEXT: movb $48, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: movb $16, %al
+; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25
+; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20
+; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm6
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9
+; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm10
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1}
+; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10
+; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1}
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18
+; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6
+; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17
+; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 2944(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2752(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2560(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 2368(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 2176(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1984(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 1600(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 1216(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 832(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 640(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax)
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: store_i64_stride6_vf64:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: subq $3400, %rsp # imm = 0xD48
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21
+; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12]
+; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14]
+; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11]
+; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15]
+; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13]
+; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm14
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
+; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16
+; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9]
+; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15]
+; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm18
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: movb $12, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1}
+; AVX512DQ-FAST-NEXT: movb $48, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm3
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm9
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm13
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm14
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm16
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm12
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: movb $16, %al
+; AVX512DQ-FAST-NEXT: kmovw %eax, %k2
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25
+; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20
+; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm8
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm9
+; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm10
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm10
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm10
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm10
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm10
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm10
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm10
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm10
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1}
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15]
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17
+; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 2944(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax)
+; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2752(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2560(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 2368(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 2176(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1984(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 1600(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1216(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 832(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 640(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax)
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-FAST-NEXT: addq $3400, %rsp # imm = 0xD48
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf64:
+; AVX512BW-ONLY-SLOW: # %bb.0:
+; AVX512BW-ONLY-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21
+; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12]
+; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte
Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm6, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: movb $12, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 
# 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movb $16, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = 
zmm31[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm31, 
%zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: 
vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf64: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $3400, %rsp # imm = 0xD48 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, 
%zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm3, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm6, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13]
+; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm7, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
+; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9]
+; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [7,15,7,15]
+; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm23, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm10, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm23, %zmm29
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm31
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm30
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm23, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm4, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: movb $12, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: movb $48, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: movb $16, %al
+; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm0
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload
+; AVX512BW-ONLY-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm10
+; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm26
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm24
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm23
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm19
+; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm21
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm28
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm29, %zmm16
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm15
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm14
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm13
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm12
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm10
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm7
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm2
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm0
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm27
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm25
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm22
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm5
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm20
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm18
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm31, %zmm17
+; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 2944(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2880(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 2816(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2752(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2624(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2560(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 2496(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2432(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 2368(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 2176(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2048(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1984(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1856(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1728(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 1600(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 960(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 832(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm1, 640(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 512(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2688(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2304(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1920(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1536(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
+; AVX512BW-ONLY-FAST-NEXT: addq $3400, %rsp # imm = 0xD48
+; AVX512BW-ONLY-FAST-NEXT: vzeroupper
+; AVX512BW-ONLY-FAST-NEXT: retq
+;
+; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf64:
+; AVX512DQBW-SLOW: # %bb.0:
+; AVX512DQBW-SLOW-NEXT: subq $3400, %rsp # imm = 0xD48
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm21
+; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12]
+; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm10
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm6, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm11
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm1
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm28, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm7, %zmm28
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm14
+; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm16
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15]
+; AVX512DQBW-SLOW-NEXT: # ymm23 = mem[0,1,0,1]
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm7
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm23, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm20
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm23, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm19
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm10, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm23, %zmm29
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm9
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm23, %zmm31
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm30
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm12, %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm21, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm23, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm10
+; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm4, %zmm21
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm23, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: movb $12, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm28 {%k1}
+; AVX512DQBW-SLOW-NEXT: movb $48, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm19 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm11
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm11
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm13
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm24
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm14
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm26
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm16
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm12
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, (%rsp) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm19
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm18
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm17
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: movb $16, %al
+; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm3
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm4
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm27
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm27
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm25
+; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm22
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload
+; AVX512DQBW-SLOW-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2}
+; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm20
+; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0
+; AVX512DQBW-SLOW-NEXT: vpunpckhqdq
{{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), 
%xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm14, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm16, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm30, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm26 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 256(%r8), %zmm31, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm24 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm23 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm21, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm30, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm31, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm30, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm31, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm31, %zmm17 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 2880(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 2816(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2560(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2432(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 2368(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1984(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 832(%rax) +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i64_stride6_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3272, %rsp # imm = 0xCC8 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm28 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm19 -; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm10 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] -; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 
%zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm27 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm23, %zmm24, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm25, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q 
%zmm16, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm25 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm14, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm14 -; AVX512BW-NEXT: movb $12, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} -; AVX512BW-NEXT: movb $48, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,9,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,13,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm17 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm21 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm21 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm8 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm16 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm16 = zmm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: movb $16, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm5 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm24 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm23 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm23 = zmm31[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm23 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm18 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm18 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm18 
-; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm16 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm8 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm7 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm9 = zmm28[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm9 -; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm10 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm5 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm27 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 256(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, 
%xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 320(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 384(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 448(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm17, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm25 -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm20, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm31 -; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm30, %zmm20 -; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm17 -; AVX512BW-NEXT: vinserti32x4 $2, 256(%r8), %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm30, %zmm12 -; AVX512BW-NEXT: vinserti32x4 $2, 320(%r8), %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm30, %zmm11 -; AVX512BW-NEXT: vinserti32x4 $2, 384(%r8), %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm30, %zmm6 -; AVX512BW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm30, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, 
%zmm24
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm0, %zmm14
-; AVX512BW-NEXT:    vpermt2q %zmm15, %zmm1, %zmm23
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm30, %zmm22
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm0, %zmm15
-; AVX512BW-NEXT:    vpermt2q %zmm21, %zmm1, %zmm18
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm30, %zmm16
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm0, %zmm21
-; AVX512BW-NEXT:    vpermt2q %zmm26, %zmm1, %zmm8
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm30, %zmm7
-; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm0, %zmm26
-; AVX512BW-NEXT:    vpermt2q %zmm28, %zmm1, %zmm9
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm30, %zmm10
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm0, %zmm27
-; AVX512BW-NEXT:    vpermt2q %zmm29, %zmm1, %zmm5
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 3008(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2944(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm27, 2880(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 2816(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2752(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 2624(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2560(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm26, 2496(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm7, 2432(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2368(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm8, 2240(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 2176(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm21, 2112(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 2048(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1984(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm18, 1856(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1792(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 1728(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm22, 1664(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1600(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 1472(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1408(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, 1344(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm24, 1280(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1216(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, 1088(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 1024(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm13, 960(%rax)
-; AVX512BW-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 896(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 832(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 704(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 640(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 576(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 512(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 448(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 320(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm19, 192(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 128(%rax)
-; AVX512BW-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-NEXT:    vmovaps %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 2688(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 2304(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm11, 1920(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm12, 1536(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 1152(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 768(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm31, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm25, (%rax)
-; AVX512BW-NEXT:    addq $3272, %rsp # imm = 0xCC8
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf64:
+; AVX512DQBW-FAST:       # %bb.0:
+; AVX512DQBW-FAST-NEXT:    subq $3400, %rsp # imm = 0xD48
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdx), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdx), %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdx), %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdx), %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdx), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdx), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdx), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdx), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rcx), %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rcx), %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rcx), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rcx), %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rcx), %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rcx), %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rcx), %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rcx), %zmm21
+; AVX512DQBW-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,12,4,12]
+; AVX512DQBW-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm0, %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [1,9,2,10,1,9,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm4
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [5,13,6,14,5,13,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm5, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm4, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [3,11,3,11,3,11,3,11]
+; AVX512DQBW-FAST-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm14, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm0, %zmm3, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm1, %zmm3, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm3, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm30, %zmm3, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm27, %zmm3, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm8, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm24, %zmm3, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm5, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm14, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm3, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm21, %zmm6, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm21, %zmm6, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm21, %zmm6, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm21, %zmm3, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rdi), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%rsi), %zmm11
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm28 = [4,12,5,13,4,12,5,13]
+; AVX512DQBW-FAST-NEXT:    # zmm28 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rdi), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%rsi), %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm1
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm28, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rdi), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%rsi), %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rdi), %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%rsi), %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rdi), %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%rsi), %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rdi), %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%rsi), %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rdi), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%rsi), %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm28, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%rsi), %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm22, %zmm7, %zmm28
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10]
+; AVX512DQBW-FAST-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm12, %zmm14
+; AVX512DQBW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14]
+; AVX512DQBW-FAST-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm10, %zmm16
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,8,1,9,0,8,1,9]
+; AVX512DQBW-FAST-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm23 = [7,15,7,15]
+; AVX512DQBW-FAST-NEXT:    # ymm23 = mem[0,1,0,1]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm22, %zmm23, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm12, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm10, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm20, %zmm23, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm12, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm10, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm19, %zmm23, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm12, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm10, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm29, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm18, %zmm23, %zmm29
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm12, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm10, %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm17, %zmm23, %zmm31
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm12, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm30
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm10, %zmm30
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm15, %zmm23, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm12, %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm21, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm23, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm11, %zmm4, %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm11, %zmm4, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermi2q %zmm11, %zmm4, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm23, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    movb $12, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm8 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm4 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm5 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm28 {%k1}
+; AVX512DQBW-FAST-NEXT:    movb $48, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm11 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm23 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, %zmm13 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, %zmm22 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm14 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, %zmm20 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, %zmm16 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, %zmm19 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm24 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm18 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, %zmm26 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm30, %zmm17 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm27 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, %zmm0 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r8), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,9,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r8), %zmm3
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r8), %zmm9
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r8), %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r8), %zmm13
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm2, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r8), %zmm14
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r8), %zmm16
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r8), %zmm12
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm15 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm2, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm17
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm10
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdi), %ymm2
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    movb $16, %al
+; AVX512DQBW-FAST-NEXT:    kmovd %eax, %k2
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u>
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm3
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm6 = zmm25[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm2, %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdi), %ymm3
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm3, %zmm0, %zmm0
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm3 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm3 = zmm29[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, %zmm4 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm1, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm11, %zmm2, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm3
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm4 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm4 = zmm31[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, %zmm0 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm1, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm13, %zmm2, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, %zmm8 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm1, %zmm27
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm14, %zmm2, %zmm25
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, %zmm0 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm1, %zmm22
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm16, %zmm2, %zmm20
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdi), %ymm0
+; AVX512DQBW-FAST-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm0, %zmm3, %zmm18
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, %zmm11
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm1, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload
+; AVX512DQBW-FAST-NEXT:    # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, %zmm0 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm12, %zmm2, %zmm17
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm2, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, %zmm28 {%k2}
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 (%r9), %zmm6
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 64(%r9), %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 128(%r9), %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 192(%r9), %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm10, %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 256(%r9), %zmm4
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm10, %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 320(%r9), %zmm5
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 384(%r9), %zmm8
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 448(%r9), %zmm9
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [0,1,2,13,4,5,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm10, %zmm23
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm10, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm10, %zmm24
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm10, %zmm19
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm10, %zmm26
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm10, %zmm0
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm10, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa (%rdx), %xmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm12 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 64(%rdx), %xmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm14 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 128(%rdx), %xmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm16 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 192(%rdx), %xmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm30 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 256(%rdx), %xmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm31 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 320(%rdx), %xmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm1 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 384(%rdx), %xmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm0 {%k1}
+; AVX512DQBW-FAST-NEXT:    vmovdqa 448(%rdx), %xmm10
+; AVX512DQBW-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0]
+; AVX512DQBW-FAST-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQBW-FAST-NEXT:    vinserti64x4 $0, %ymm10, %zmm0, %zmm21 {%k1}
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, (%r8), %zmm12, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,1,2,3,4,8,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm29, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 64(%r8), %zmm14, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm29, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 128(%r8), %zmm16, %zmm10
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm29, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 192(%r8), %zmm30, %zmm26
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm29, %zmm26
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 256(%r8), %zmm31, %zmm24
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm29, %zmm24
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 320(%r8), %zmm1, %zmm23
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm29, %zmm23
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 384(%r8), %zmm0, %zmm19
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm29, %zmm19
+; AVX512DQBW-FAST-NEXT:    vinserti32x4 $2, 448(%r8), %zmm21, %zmm21
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm29, %zmm21
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [0,10,2,3,4,5,6,11]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm29, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [0,1,2,3,4,12,6,7]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm30, %zmm28
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm31 = [0,14,2,3,4,5,6,15]
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm6, %zmm31, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm29, %zmm16
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm30, %zmm15
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm7, %zmm31, %zmm14
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm29, %zmm13
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm30, %zmm12
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm2, %zmm31, %zmm11
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm29, %zmm10
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm30, %zmm7
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm3, %zmm31, %zmm2
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm29, %zmm1
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm30, %zmm3
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm4, %zmm31, %zmm0
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm29, %zmm27
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm30, %zmm4
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm5, %zmm31, %zmm25
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm29, %zmm22
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm30, %zmm5
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm8, %zmm31, %zmm20
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm29, %zmm18
+; AVX512DQBW-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm30, %zmm6
+; AVX512DQBW-FAST-NEXT:    vpermt2q %zmm9, %zmm31, %zmm17
+; AVX512DQBW-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm17, 3008(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm8, 2944(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm6, 2880(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm18, 2816(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups (%rsp), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm6, 2752(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm20, 2624(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm6, 2560(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm5, 2496(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm22, 2432(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 2368(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm25, 2240(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm5, 2176(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm4, 2112(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm27, 2048(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm4, 1984(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm0, 1856(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1792(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm3, 1728(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm1, 1664(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm3, 1600(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm2, 1472(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1408(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm7, 1344(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm10, 1280(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm2, 1216(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm11, 1088(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 1024(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm12, 960(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm13, 896(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm1, 832(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm14, 704(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm1, 640(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm15, 576(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm16, 512(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 448(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 320(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 256(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm28, 192(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 128(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm21, 2688(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm19, 2304(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm23, 1920(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm24, 1536(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovdqa64 %zmm26, 1152(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, 384(%rax)
+; AVX512DQBW-FAST-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQBW-FAST-NEXT:    vmovaps %zmm0, (%rax)
+; AVX512DQBW-FAST-NEXT:    addq $3400, %rsp # imm = 0xD48
+; AVX512DQBW-FAST-NEXT:    vzeroupper
+; AVX512DQBW-FAST-NEXT:    retq
 %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64
 %in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64
 %in.vec2 = load <64 x i64>, ptr %in.vecptr2, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index 59857b1a3671e..7f999e06b304c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -420,14 +420,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movapd (%rcx), %xmm7
 ; SSE-NEXT:    movapd 16(%rcx), %xmm11
 ; SSE-NEXT:    movapd (%r8), %xmm9
-; SSE-NEXT:    movapd 16(%r8), %xmm15
+; SSE-NEXT:    movapd 16(%r8), %xmm14
 ; SSE-NEXT:    movapd (%r9), %xmm12
 ; SSE-NEXT:    movapd 16(%r9), %xmm13
 ; SSE-NEXT:    movapd (%rax), %xmm0
 ; SSE-NEXT:    movapd 16(%rax), %xmm1
-; SSE-NEXT:    movapd %xmm2, %xmm14
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm3[0]
-; SSE-NEXT:    movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    movapd %xmm2, %xmm15
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm3[0]
+; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
 ; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
@@ -440,18 +440,18 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movapd %xmm9, (%rsp) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1]
 ; SSE-NEXT:    movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    movapd %xmm5, %xmm14
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm6[0]
+; SSE-NEXT:    movapd %xmm5, %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
 ; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
 ; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm11[0]
 ; SSE-NEXT:    movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm15[1]
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm14[1]
 ; SSE-NEXT:    movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm13[0]
-; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm13[0]
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1]
 ; SSE-NEXT:    movapd 32(%rsi), %xmm12
 ; SSE-NEXT:    movapd %xmm10, %xmm15
@@ -461,38 +461,38 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT:    movapd 32(%rdx), %xmm11
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
-; SSE-NEXT:    movapd 32(%rcx), %xmm7
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0]
+; SSE-NEXT:    movapd 32(%rcx), %xmm8
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0]
 ; SSE-NEXT:    movapd 32(%r8), %xmm9
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1]
-; SSE-NEXT:    movapd 32(%r9), %xmm5
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm5[0]
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
-; SSE-NEXT:    movapd 48(%rdi), %xmm6
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
+; SSE-NEXT:    movapd 32(%r9), %xmm6
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0]
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-NEXT:    movapd 48(%rdi), %xmm5
 ; SSE-NEXT:    movapd 48(%rsi), %xmm4
-; SSE-NEXT:    movapd %xmm6, %xmm8
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0]
+; SSE-NEXT:    movapd %xmm5, %xmm7
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm4[0]
 ; SSE-NEXT:    movapd 48(%rax), %xmm10
-; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1]
+; SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1]
 ; SSE-NEXT:    movapd 48(%rdx), %xmm3
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSE-NEXT:    movapd 48(%rcx), %xmm1
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE-NEXT:    movapd 48(%r8), %xmm2
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT:    movapd 48(%rcx), %xmm2
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE-NEXT:    movapd 48(%r8), %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 ; SSE-NEXT:    movapd 48(%r9), %xmm0
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movapd %xmm0, 432(%rax)
-; SSE-NEXT:    movapd %xmm1, 416(%rax)
+; SSE-NEXT:    movapd %xmm2, 416(%rax)
 ; SSE-NEXT:    movapd %xmm4, 400(%rax)
-; SSE-NEXT:    movapd %xmm6, 384(%rax)
-; SSE-NEXT:    movapd %xmm2, 368(%rax)
+; SSE-NEXT:    movapd %xmm5, 384(%rax)
+; SSE-NEXT:    movapd %xmm1, 368(%rax)
 ; SSE-NEXT:    movapd %xmm3, 352(%rax)
-; SSE-NEXT:    movapd %xmm8, 336(%rax)
-; SSE-NEXT:    movapd %xmm5, 320(%rax)
-; SSE-NEXT:    movapd %xmm7, 304(%rax)
+; SSE-NEXT:    movapd %xmm7, 336(%rax)
+; SSE-NEXT:    movapd %xmm6, 320(%rax)
+; SSE-NEXT:    movapd %xmm8, 304(%rax)
 ; SSE-NEXT:    movapd %xmm12, 288(%rax)
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm0, 272(%rax)
@@ -506,11 +506,11 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    movaps %xmm0, 176(%rax)
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm0, 160(%rax)
-; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT:    movaps %xmm0, 144(%rax)
+; SSE-NEXT:    movapd %xmm14, 144(%rax)
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm0, 128(%rax)
-; SSE-NEXT:    movapd %xmm14, 112(%rax)
+; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT:    movaps %xmm0, 112(%rax)
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-NEXT:    movaps %xmm0, 96(%rax)
 ; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -538,7 +538,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT:    vmovaps (%r9), %ymm3
 ; AVX1-ONLY-NEXT:    vmovaps (%rax), %xmm5
 ; AVX1-ONLY-NEXT:    vmovaps 16(%rax), %xmm0
-; AVX1-ONLY-NEXT:    vmovapd 32(%rax), %xmm8
+; AVX1-ONLY-NEXT:    vmovapd 32(%rax), %xmm11
 ; AVX1-ONLY-NEXT:    vmovaps 16(%rdi), %xmm1
 ; AVX1-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
 ; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
@@ -547,87 +547,87 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT:    vmovapd 32(%r8), %xmm4
 ; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm1
 ; AVX1-ONLY-NEXT:    vmovaps 32(%rdx), %xmm6
-; AVX1-ONLY-NEXT:    vmovaps 32(%rsi), %xmm10
-; AVX1-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm6[1]
-; AVX1-ONLY-NEXT:    vbroadcastsd 40(%rcx), %ymm11
-; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
+; AVX1-ONLY-NEXT:    vmovaps 32(%rsi), %xmm8
+; AVX1-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm6[1]
+; AVX1-ONLY-NEXT:    vbroadcastsd 40(%rcx), %ymm10
+; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
 ; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm1[6,7]
 ; AVX1-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT:    vmovaps 32(%rdi), %xmm11
-; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm12
+; AVX1-ONLY-NEXT:    vmovaps 32(%rdi), %xmm10
+; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm12
 ; AVX1-ONLY-NEXT:    vmovaps (%r9), %xmm9
-; AVX1-ONLY-NEXT:    vmovapd 32(%r9), %xmm14
-; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm14[0]
-; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm8, %ymm4, %ymm4
+; AVX1-ONLY-NEXT:    vmovapd 32(%r9), %xmm15
+; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm15[0]
+; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm4, %ymm4
 ; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm12[6,7]
 ; AVX1-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-ONLY-NEXT:    vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm10[0]
-; AVX1-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm10, %ymm11
-; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm6, %ymm10, %ymm6
-; AVX1-ONLY-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[2]
-; AVX1-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm10
-; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX1-ONLY-NEXT:    vmovaps (%rsi), %xmm10
-; AVX1-ONLY-NEXT:    vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3]
-; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT:    vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm8[0]
+; AVX1-ONLY-NEXT:    vinsertf128 $1, 32(%rcx), %ymm8, %ymm10
+; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm6, %ymm8, %ymm6
+; AVX1-ONLY-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2]
+; AVX1-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm8
+; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX1-ONLY-NEXT:    vmovaps (%rsi), %xmm8
+; AVX1-ONLY-NEXT:    vshufps {{.*#+}} xmm10 = xmm8[2,3,2,3]
+; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3,4,5,6,7]
 ; AVX1-ONLY-NEXT:    vmovaps (%r8), %xmm12
-; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm11
-; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
+; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm10
+; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
 ; AVX1-ONLY-NEXT:    vmovaps 16(%rcx), %xmm13
-; AVX1-ONLY-NEXT:    vshufps {{.*#+}} xmm11 = xmm13[2,3,2,3]
-; AVX1-ONLY-NEXT:    vmovaps 16(%r8), %xmm15
-; AVX1-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm15[1],ymm3[3],ymm15[3]
-; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7]
-; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3,4,5,6,7]
+; AVX1-ONLY-NEXT:    vshufps {{.*#+}} xmm10 = xmm13[2,3,2,3]
+; AVX1-ONLY-NEXT:    vmovaps 16(%r8), %xmm14
+; AVX1-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm14[1],ymm3[3],ymm14[3]
+; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7]
+; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3,4,5,6,7]
 ; AVX1-ONLY-NEXT:    vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0]
-; AVX1-ONLY-NEXT:    vmovaps (%rdi), %xmm15
-; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm12, %ymm0
+; AVX1-ONLY-NEXT:    vmovaps (%rdi), %xmm14
+; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm14, %ymm12, %ymm0
 ; AVX1-ONLY-NEXT:    vinsertf128 $1, %xmm5, %ymm12, %ymm12
 ; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7]
-; AVX1-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm8[1]
-; AVX1-ONLY-NEXT:    vmovapd 32(%rdi), %ymm8
-; AVX1-ONLY-NEXT:    vmovapd 32(%rsi), %ymm14
-; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[2],ymm14[2]
-; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3]
+; AVX1-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm11[1]
+; AVX1-ONLY-NEXT:    vmovapd 32(%rdi), %ymm11
+; AVX1-ONLY-NEXT:    vmovapd 32(%rsi), %ymm15
+; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm15[0],ymm11[2],ymm15[2]
+; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3]
 ; AVX1-ONLY-NEXT:    vmovapd 48(%rdi), %xmm0
 ; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3]
-; AVX1-ONLY-NEXT:    vshufpd {{.*#+}} ymm14 = ymm14[0,0,3,2]
-; AVX1-ONLY-NEXT:    vmovapd 32(%rax), %ymm1
-; AVX1-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm14[2,3]
-; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm2 = ymm14[0],ymm0[1],ymm14[2],ymm0[3]
-; AVX1-ONLY-NEXT:    vmovapd 32(%r8), %ymm14
-; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm14 = ymm14[0],mem[0],ymm14[2],mem[2]
+; AVX1-ONLY-NEXT:    vshufpd {{.*#+}} ymm15 = ymm15[0,0,3,2]
+; AVX1-ONLY-NEXT:    vmovapd 32(%rax), %ymm2
+; AVX1-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm15 = ymm2[2,3],ymm15[2,3]
+; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3]
+; AVX1-ONLY-NEXT:    vmovapd 32(%r8), %ymm0
+; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2]
 ; AVX1-ONLY-NEXT:    vmovapd 48(%rcx), %xmm0
 ; AVX1-ONLY-NEXT:    vmovapd 48(%rdx), %xmm4
 ; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3]
-; AVX1-ONLY-NEXT:    vmovaps (%rdi), %ymm14
-; AVX1-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX1-ONLY-NEXT:    # ymm14 = ymm14[0],mem[0],ymm14[2],mem[2]
+; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
+; AVX1-ONLY-NEXT:    vmovaps (%rdi), %ymm4
+; AVX1-ONLY-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX1-ONLY-NEXT:    # ymm4 = ymm4[0],mem[0],ymm4[2],mem[2]
 ; AVX1-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm5 = xmm9[1],xmm5[1]
-; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm14[4,5,6,7]
-; AVX1-ONLY-NEXT:    vmovaps (%r8), %ymm9
-; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2]
-; AVX1-ONLY-NEXT:    vmovaps 16(%rdx), %xmm9
-; AVX1-ONLY-NEXT:    vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0]
-; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX1-ONLY-NEXT:    vmovaps (%r8), %ymm5
+; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2]
+; AVX1-ONLY-NEXT:    vmovaps 16(%rdx), %xmm5
+; AVX1-ONLY-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0]
+; AVX1-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
 ; AVX1-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
-; AVX1-ONLY-NEXT:    vbroadcastsd 56(%r9), %ymm9
-; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3]
-; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
+; AVX1-ONLY-NEXT:    vbroadcastsd 56(%r9), %ymm5
+; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3]
+; AVX1-ONLY-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3]
 ; AVX1-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX1-ONLY-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm10[0]
-; AVX1-ONLY-NEXT:    vmovaps (%rdx), %xmm9
-; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
-; AVX1-ONLY-NEXT:    vmovaps %xmm9, 16(%rax)
-; AVX1-ONLY-NEXT:    vmovaps %xmm1, (%rax)
+; AVX1-ONLY-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm14[0],xmm8[0]
+; AVX1-ONLY-NEXT:    vmovaps (%rdx), %xmm5
+; AVX1-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX1-ONLY-NEXT:    vmovaps %xmm5, 16(%rax)
+; AVX1-ONLY-NEXT:    vmovaps %xmm2, (%rax)
 ; AVX1-ONLY-NEXT:    vmovaps %ymm3, 128(%rax)
-; AVX1-ONLY-NEXT:    vmovaps %ymm5, 96(%rax)
-; AVX1-ONLY-NEXT:    vmovapd %ymm4, 352(%rax)
-; AVX1-ONLY-NEXT:    vmovapd %ymm2, 384(%rax)
-; AVX1-ONLY-NEXT:    vmovapd %ymm8, 320(%rax)
+; AVX1-ONLY-NEXT:    vmovaps %ymm4, 96(%rax)
+; AVX1-ONLY-NEXT:    vmovapd %ymm1, 352(%rax)
+; AVX1-ONLY-NEXT:    vmovapd %ymm15, 384(%rax)
+; AVX1-ONLY-NEXT:    vmovapd %ymm11, 320(%rax)
 ; AVX1-ONLY-NEXT:    vmovaps %ymm12, 32(%rax)
-; AVX1-ONLY-NEXT:    vmovaps %ymm11, 192(%rax)
+; AVX1-ONLY-NEXT:    vmovaps %ymm10, 192(%rax)
 ; AVX1-ONLY-NEXT:    vmovaps %ymm7, 64(%rax)
 ; AVX1-ONLY-NEXT:    vmovapd %ymm6, 224(%rax)
 ; AVX1-ONLY-NEXT:    vmovapd %ymm0, 416(%rax)
@@ -647,14 +647,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %ymm8
 ; AVX2-ONLY-NEXT:    vmovaps (%rdi), %ymm4
 ; AVX2-ONLY-NEXT:    vmovaps (%rsi), %ymm7
-; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm9
+; AVX2-ONLY-NEXT:    vmovaps 32(%rsi), %ymm10
 ; AVX2-ONLY-NEXT:    vmovaps (%rdx), %ymm5
-; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm10
+; AVX2-ONLY-NEXT:    vmovaps 32(%rdx), %ymm9
 ; AVX2-ONLY-NEXT:    vmovaps 32(%r8), %ymm3
 ; AVX2-ONLY-NEXT:    vmovaps (%rax), %xmm2
 ; AVX2-ONLY-NEXT:    vmovaps 16(%rax), %xmm0
-; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm12
-; AVX2-ONLY-NEXT:    vmovaps 48(%rax), %xmm13
+; AVX2-ONLY-NEXT:    vmovaps 32(%rax), %xmm13
+; AVX2-ONLY-NEXT:    vmovaps 48(%rax), %xmm12
 ; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
@@ -670,7 +670,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2]
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm10[0],mem[0],ymm10[2],mem[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm11 = ymm9[0],mem[0],ymm9[2],mem[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm3[2,3]
 ; AVX2-ONLY-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
@@ -681,31 +681,31 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vmovaps 32(%rdi), %xmm14
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm7
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm12, %ymm15, %ymm15
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm13, %ymm15, %ymm15
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5],ymm10[6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm13
-; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm13, %ymm13
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%rsi), %xmm12
+; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rcx), %ymm12, %ymm12
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, (%rdx), %ymm11, %ymm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2]
 ; AVX2-ONLY-NEXT:    vmovaps (%r8), %xmm15
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
-; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm12
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3,4,5,6,7]
-; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm12
-; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm12
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-ONLY-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
+; AVX2-ONLY-NEXT:    vbroadcastsd 8(%rcx), %ymm13
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7]
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7]
+; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm13
+; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
+; AVX2-ONLY-NEXT:    vmovaps (%r8), %ymm13
 ; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2]
 ; AVX2-ONLY-NEXT:    vmovaps (%r9), %ymm0
-; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
+; AVX2-ONLY-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm0[0],ymm13[2],ymm0[2]
 ; AVX2-ONLY-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 56(%rcx), %ymm5
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
@@ -719,7 +719,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    vinsertf128 $1, %xmm11, %ymm6, %ymm6
 ; AVX2-ONLY-NEXT:    vbroadcastsd %xmm2, %ymm2
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5],ymm6[6,7]
-; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
+; AVX2-ONLY-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3]
 ; AVX2-ONLY-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
 ; AVX2-ONLY-NEXT:    vbroadcastsd 24(%rcx), %ymm6
@@ -727,11 +727,11 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-ONLY-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX2-ONLY-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7]
 ; AVX2-ONLY-NEXT:    vmovaps %ymm1, 128(%rcx)
-; AVX2-ONLY-NEXT:    vmovaps %ymm9, 64(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm10, 64(%rcx)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm8, 320(%rcx)
-; AVX2-ONLY-NEXT:    vmovaps %ymm13, (%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm12, (%rcx)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm0, 192(%rcx)
-; AVX2-ONLY-NEXT:    vmovaps %ymm10, 384(%rcx)
+; AVX2-ONLY-NEXT:    vmovaps %ymm9, 384(%rcx)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm7, 256(%rcx)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm2, 32(%rcx)
 ; AVX2-ONLY-NEXT:    vmovaps %ymm4, 96(%rcx)
@@ -752,11 +752,11 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rsi), %zmm1
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm2
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rdx), %zmm3
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%rcx), %zmm4
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r8), %zmm6
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r9), %zmm7
-; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm4
+; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 (%r10), %zmm2
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8]
@@ -765,14 +765,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm6, %zmm7, %zmm9
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
-; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5]
-; AVX512F-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm11
+; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5]
+; AVX512F-ONLY-SLOW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm7, %zmm6, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT:    vpermt2q %zmm7, %zmm5, %zmm6
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7]
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
@@ -783,7 +783,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k1
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm7, %zmm5
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm7, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm6
@@ -799,10 +799,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm6 {%k2}
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm9, %zmm7
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm2, %zmm9, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm8
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14]
 ; AVX512F-ONLY-SLOW-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm9
@@ -815,14 +815,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm7
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3]
 ; AVX512F-ONLY-SLOW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm3, %zmm2, %zmm9
+; AVX512F-ONLY-SLOW-NEXT:    vpermi2q %zmm4, %zmm3, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r9), %ymm7
 ; AVX512F-ONLY-SLOW-NEXT:    vmovdqa (%r8), %ymm12
 ; AVX512F-ONLY-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2]
 ; AVX512F-ONLY-SLOW-NEXT:    movb $28, %sil
 ; AVX512F-ONLY-SLOW-NEXT:    kmovw %esi, %k2
-; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm4[2,3,2,3]
+; AVX512F-ONLY-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3]
 ; AVX512F-ONLY-SLOW-NEXT:    vbroadcasti64x4
{{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 @@ -833,24 +833,24 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm10, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] ; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] @@ -860,7 +860,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) @@ -877,10 +877,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = 
[15,7,15,7,15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] @@ -896,7 +896,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 @@ -907,7 +907,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 @@ -923,10 +923,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 @@ -936,7 +936,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] ; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movb $48, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] @@ -953,12 +953,12 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = 
zmm9[2,3,2,3],zmm3[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 @@ -969,18 +969,18 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movb $120, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -1000,78 +1000,78 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; 
AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] ; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512DQ-SLOW-NEXT: movb $96, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX512DQ-SLOW-NEXT: movb $28, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] ; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] ; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; 
AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] ; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 ; AVX512DQ-SLOW-NEXT: movb $56, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] ; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 @@ -1088,32 +1088,32 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 ; AVX512DQ-SLOW-NEXT: movb $120, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: movb $48, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -1124,89 +1124,89 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm3 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] ; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-FAST-NEXT: movb $48, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> -; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: 
vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm7 ; AVX512DQ-FAST-NEXT: movb $96, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQ-FAST-NEXT: movb $28, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} ; 
AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512DQ-FAST-NEXT: movb $56, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 @@ -1221,20 +1221,20 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 ; AVX512DQ-FAST-NEXT: movb $120, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -1246,11 +1246,11 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] @@ -1259,14 +1259,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = 
mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 @@ -1277,7 +1277,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 @@ -1293,10 +1293,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 @@ -1309,14 +1309,14 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 @@ -1327,24 +1327,24 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm14 = [4,9,0,3,4,9,0,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm10, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] ; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] @@ -1354,7 +1354,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) @@ -1371,10 +1371,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] @@ -1390,7 +1390,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; 
AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 @@ -1401,7 +1401,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 @@ -1417,10 +1417,10 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 @@ -1430,7 +1430,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] ; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] @@ -1447,12 +1447,12 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 @@ -1463,18 +1463,18 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: movb $120, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -1494,78 +1494,78 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb 
$-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 ; AVX512DQBW-SLOW-NEXT: movb $96, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q 
%zmm0, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 ; AVX512DQBW-SLOW-NEXT: movb $56, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 @@ -1582,32 +1582,32 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 ; AVX512DQBW-SLOW-NEXT: movb $120, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $48, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm7, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; @@ -1618,89 +1618,89 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm3 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512DQBW-FAST-NEXT: movb $48, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} ; 
AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm7 ; AVX512DQBW-FAST-NEXT: movb $96, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX512DQBW-FAST-NEXT: movb $28, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 ; 
AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 ; AVX512DQBW-FAST-NEXT: movb $56, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 @@ -1715,20 +1715,20 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 ; AVX512DQBW-FAST-NEXT: movb $120, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper @@ -1897,38 +1897,38 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] ; SSE-NEXT: movapd 96(%rdx), %xmm11 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movapd 96(%rcx), %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movapd 96(%rcx), %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; SSE-NEXT: movapd 96(%r8), %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movapd 96(%r9), %xmm5 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movapd 112(%rdi), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movapd 96(%r9), %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movapd 112(%rdi), %xmm5 ; SSE-NEXT: movapd 112(%rsi), %xmm4 -; SSE-NEXT: movapd %xmm6, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movapd 112(%rax), %xmm10 -; 
SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 112(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 112(%rcx), %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd 112(%r8), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movapd 112(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movapd 112(%r8), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 112(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 880(%rax) -; SSE-NEXT: movapd %xmm1, 864(%rax) +; SSE-NEXT: movapd %xmm2, 864(%rax) ; SSE-NEXT: movapd %xmm4, 848(%rax) -; SSE-NEXT: movapd %xmm6, 832(%rax) -; SSE-NEXT: movapd %xmm2, 816(%rax) +; SSE-NEXT: movapd %xmm5, 832(%rax) +; SSE-NEXT: movapd %xmm1, 816(%rax) ; SSE-NEXT: movapd %xmm3, 800(%rax) -; SSE-NEXT: movapd %xmm8, 784(%rax) -; SSE-NEXT: movapd %xmm5, 768(%rax) -; SSE-NEXT: movapd %xmm7, 752(%rax) +; SSE-NEXT: movapd %xmm7, 784(%rax) +; SSE-NEXT: movapd %xmm6, 768(%rax) +; SSE-NEXT: movapd %xmm8, 752(%rax) ; SSE-NEXT: movapd %xmm12, 736(%rax) ; SSE-NEXT: movapd %xmm15, 720(%rax) ; SSE-NEXT: movapd %xmm9, 704(%rax) @@ -2115,8 +2115,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2130,10 +2130,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 @@ -2153,14 +2153,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] @@ -2229,10 +2229,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm6, 864(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 800(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 736(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -2269,74 +2269,74 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: subq $552, %rsp # imm = 0x228 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm14[1],ymm7[3],ymm14[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, 
%ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm15[1],ymm8[3],ymm15[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 80(%rax), %xmm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm5[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2346,1858 +2346,1847 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm14, %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm9 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm8, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1],ymm0[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 
%xmm9, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm3, 800(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 768(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 576(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 544(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 512(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 832(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 736(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 704(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 800(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 768(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 576(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 544(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 512(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 448(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 832(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 736(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 704(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 672(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 
640(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 416(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 640(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 608(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 864(%rax) -; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, 
%zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm21, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: 
vpermt2q %zmm25, %zmm28, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[2,3,2,3],zmm23[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm30[2,3,2,3],zmm22[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm22, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: 
vpermt2q %zmm6, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: movb $48, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 +; 
AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb 
$-61, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm28 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm18[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, 
%zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm19[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm0, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm18, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm22, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 ; AVX512F-ONLY-FAST-NEXT: movb $96, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 ; AVX512F-ONLY-FAST-NEXT: movb $120, %sil -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 ; AVX512F-ONLY-FAST-NEXT: movb $24, %dil ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %dil ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $-61, %dil ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 ; 
AVX512F-ONLY-FAST-NEXT: movb $48, %sil ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm19 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm24, %ymm5, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm22, %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm23 {%k3} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm17[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm24, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $28, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[2,3,2,3],zmm10[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm8[0],ymm22[0],ymm8[2],ymm22[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[2,3,2,3],zmm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: 
vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $56, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm26, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%zmm18, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $64, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] ; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm24 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: movb $96, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 
; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] ; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm21, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm15, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] ; AVX512DQ-SLOW-NEXT: movb $28, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[2,3,2,3],zmm23[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm30[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q 
%zmm22, %zmm30, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm23, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] ; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 ; AVX512DQ-SLOW-NEXT: movb $48, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm4[0],zmm0[0],zmm4[2],zmm0[2],zmm4[4],zmm0[4],zmm4[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: movb $12, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} ; AVX512DQ-SLOW-NEXT: movb $112, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm24, %zmm1 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k7 +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k5} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k6} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} ; AVX512DQ-SLOW-NEXT: movb $56, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm8 {%k7} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; 
AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k3} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm28 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k5} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k6} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k7} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 
256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 
(%rdx), %xmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm24 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm6 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm17 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 ; AVX512DQ-FAST-NEXT: movb $112, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm17, %zmm6 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm22 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 ; AVX512DQ-FAST-NEXT: movb $96, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 ; AVX512DQ-FAST-NEXT: movb $120, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm24[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 
%xmm24, %ymm0, %ymm24 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm25, %zmm8 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm24 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] ; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 ; AVX512DQ-FAST-NEXT: movb $24, %dil ; AVX512DQ-FAST-NEXT: kmovw %edi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} ; AVX512DQ-FAST-NEXT: movb $-31, %dil ; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm19 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm20 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $-61, %dil ; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; 
AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 ; AVX512DQ-FAST-NEXT: movb $48, %sil ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k3} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm21 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm7 -; AVX512DQ-FAST-NEXT: vpermt2q %ymm27, %ymm5, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm22 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm20 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQ-FAST-NEXT: vpermi2q %ymm21, %ymm7, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm22 {%k3} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm18[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: movb $28, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[2,3,2,3],zmm10[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm21[0],ymm7[2],ymm21[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[2,3,2,3],zmm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm31, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: movb $64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} ; AVX512DQ-FAST-NEXT: movb $56, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm25, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; 
AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
(%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = 
[2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm21, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm28, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[2,3,2,3],zmm23[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = 
zmm30[2,3,2,3],zmm22[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm22, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm24, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 
{%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm28 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; 
AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm4, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm18[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm19[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm0, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm18, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm22 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm22, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: 
vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %dil ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %dil ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %dil ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm19 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm24 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm24, %ymm5, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm22, %ymm8, %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm23 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm17[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm24, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $28, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[2,3,2,3],zmm10[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm8[0],ymm22[0],ymm8[2],ymm22[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[2,3,2,3],zmm13[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm26, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, 
%zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 ; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 (%rdx), %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $64, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm24 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: movb $96, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm21, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm15, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q 
%zmm20, %zmm13, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[2,3,2,3],zmm23[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm0, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm30[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512DQBW-SLOW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm23, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm1, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 ; AVX512DQBW-SLOW-NEXT: movb $48, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm4[0],zmm0[0],zmm4[2],zmm0[2],zmm4[4],zmm0[4],zmm4[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm24, %zmm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k7 +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k5} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k6 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k6} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $56, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm8 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k3} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm28 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq 
{{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k5} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k7} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm24 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, 
%ymm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm6 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm17 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 ; AVX512DQBW-FAST-NEXT: movb $112, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm17, %zmm6 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm22 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 ; AVX512DQBW-FAST-NEXT: movb $96, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 ; AVX512DQBW-FAST-NEXT: movb $120, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm24[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm24 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, 
%ymm24, %zmm0, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm25, %zmm8 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] ; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 ; AVX512DQBW-FAST-NEXT: movb $24, %dil ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} ; AVX512DQBW-FAST-NEXT: movb $-31, %dil ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm20 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $-61, %dil ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} +; 
AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 ; AVX512DQBW-FAST-NEXT: movb $48, %sil ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k3} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm21 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm23 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm27, %ymm5, %ymm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm22 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm20 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm21, %ymm7, %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm22 {%k3} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm18[0,1,2,3],zmm16[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; 
AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: movb $28, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[2,3,2,3],zmm10[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm21[0],ymm7[2],ymm21[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[2,3,2,3],zmm13[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: movb $64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} ; AVX512DQBW-FAST-NEXT: movb $56, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm25, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm4, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -4540,38 +4529,38 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] ; SSE-NEXT: movapd 224(%rdx), %xmm11 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movapd 224(%rcx), %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movapd 224(%rcx), %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; SSE-NEXT: movapd 224(%r8), %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movapd 224(%r9), %xmm5 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movapd 240(%rdi), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movapd 224(%r9), %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movapd 240(%rdi), %xmm5 ; SSE-NEXT: movapd 240(%rsi), %xmm4 -; SSE-NEXT: movapd %xmm6, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movapd 240(%rax), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 240(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 240(%rcx), %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd 240(%r8), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movapd 240(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movapd 240(%r8), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 240(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 1776(%rax) -; SSE-NEXT: movapd %xmm1, 1760(%rax) +; SSE-NEXT: movapd %xmm2, 1760(%rax) ; SSE-NEXT: movapd %xmm4, 1744(%rax) -; SSE-NEXT: movapd %xmm6, 1728(%rax) -; SSE-NEXT: movapd %xmm2, 1712(%rax) +; SSE-NEXT: movapd %xmm5, 1728(%rax) +; SSE-NEXT: movapd %xmm1, 1712(%rax) ; SSE-NEXT: movapd %xmm3, 1696(%rax) -; SSE-NEXT: movapd %xmm8, 1680(%rax) -; SSE-NEXT: movapd %xmm5, 1664(%rax) -; SSE-NEXT: movapd %xmm7, 1648(%rax) +; SSE-NEXT: movapd %xmm7, 1680(%rax) +; SSE-NEXT: movapd %xmm6, 1664(%rax) +; SSE-NEXT: movapd %xmm8, 1648(%rax) ; SSE-NEXT: movapd %xmm12, 1632(%rax) ; SSE-NEXT: movapd %xmm15, 1616(%rax) ; SSE-NEXT: movapd %xmm9, 1600(%rax) @@ -4787,9 +4776,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm0, %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 @@ -4876,8 +4864,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] @@ -4968,8 +4957,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] @@ -5061,14 +5050,14 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] @@ -5082,69 +5071,69 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm2[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 
224(%rax), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 240(%rcx), %xmm15 ; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm10[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm15 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 @@ -5162,13 +5151,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %xmm15, 912(%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 896(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 1760(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 1728(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 1696(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 1664(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 1728(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 1696(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 1664(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 1632(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1600(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 1568(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1536(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 1536(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5268,74 +5257,73 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: 
vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm5 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm5 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm4 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm4[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5371,11 +5359,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5411,11 +5399,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5442,17 +5430,19 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5466,13 +5456,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm11 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = 
mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -5486,9 +5476,9 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5500,8 +5490,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm0 @@ -5509,168 +5499,166 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm4 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm4[0],ymm10[0],ymm4[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm10[1],ymm4[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 208(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm2 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = 
ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd %xmm13, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm3, %ymm3 # 16-byte Folded Reload +; 
AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm12 +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm4, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm5 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm5[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm7 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1760(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1728(%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1760(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 1728(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1664(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1664(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX2-ONLY-NEXT: vmovaps %ymm0, 1600(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1568(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 1536(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1536(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5679,10 +5667,10 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1376(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1376(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1344(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1312(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 1312(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5693,8 +5681,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1120(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1088(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1120(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1088(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5703,7 +5691,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 928(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 928(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 864(%rcx) @@ -5717,7 +5705,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 672(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 672(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5769,443 +5757,445 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 (%rsi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512F-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, 
%zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm17[0],ymm1[2],ymm17[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm20, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; 
AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm16[0],ymm19[2],ymm16[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm16, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm12 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm20, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[3,0,12,4,3,0,12,4] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm24, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm30, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm31, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 -; 
AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm8[0],zmm10[0],zmm8[2],zmm10[2],zmm8[4],zmm10[4],zmm8[6],zmm10[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm12[0],zmm9[0],zmm12[2],zmm9[2],zmm12[4],zmm9[4],zmm12[6],zmm9[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm12 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm11[0],zmm25[0],zmm11[2],zmm25[2],zmm11[4],zmm25[4],zmm11[6],zmm25[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm14[0],zmm0[0],zmm14[2],zmm0[2],zmm14[4],zmm0[4],zmm14[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm20, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm30 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm12 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm20, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm5 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm0 ; 
AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm26[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm10 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 
{%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm22[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm18, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm16, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm16[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm16 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm21, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; 
AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: movb $64, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} ; AVX512F-ONLY-SLOW-NEXT: movb $8, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm14, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm15, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = 
[0,1,12,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) 
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -6214,440 +6204,439 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 ; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm28, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm28, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm26[0],ymm20[2],ymm26[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm26, %ymm28, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm16, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm3[2,3,2,3],zmm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm1, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm20, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm29, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: 
vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm25 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 
{%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $120, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm19 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} ; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k5} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k5} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: 
vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm28 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm15 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k4} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k4 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm15 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm9 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],mem[0],ymm10[2],mem[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm10[2,3,2,3],zmm8[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 ; AVX512F-ONLY-FAST-NEXT: movb $64, %al ; 
AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $8, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; 
AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) ; AVX512F-ONLY-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -6656,876 +6645,879 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 64(%rsi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 ; AVX512DQ-SLOW-NEXT: movb $96, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm16, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm16[0],ymm1[2],ymm16[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm5, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX512DQ-SLOW-NEXT: movb $28, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] ; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; 
AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm5, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm20, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm20[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, 
%zmm7, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm22 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm16, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm29, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm17, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $48, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] ; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm11[0],zmm10[0],zmm11[2],zmm10[2],zmm11[4],zmm10[4],zmm11[6],zmm10[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm22, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm20 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm14, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm14[0],zmm31[0],zmm14[2],zmm31[2],zmm14[4],zmm31[4],zmm14[6],zmm31[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm4, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm22, %zmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm10[0],zmm0[0],zmm10[2],zmm0[2],zmm10[4],zmm0[4],zmm10[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm18 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm2, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k3} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm22 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm22 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k3} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm27[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} +; AVX512DQ-SLOW-NEXT: 
vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm4 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; 
AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4} ; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm29 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm11 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm14 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm3, %zmm4 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $6, %sil +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k4} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512DQ-SLOW-NEXT: movb $56, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm8 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,11,u,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm20, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm21, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %ymm21 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm21[0],mem[0],ymm21[2],mem[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm21[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm16 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm16 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm16, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX512DQ-SLOW-NEXT: movb $64, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} ; AVX512DQ-SLOW-NEXT: movb $8, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm10, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm16, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,12,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm13, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm19, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm13, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm7, %zmm12 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1536(%rax) ; AVX512DQ-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512DQ-FAST-NEXT: subq $2056, %rsp # imm = 0x808 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 (%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm8 ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 
{{.*#+}} zmm8 {%k2} = zmm2[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm27, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm29, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm0, %ymm27, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm25[0],ymm20[2],ymm25[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm25, %ymm27, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm13[2,3,2,3] +; 
AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, 
%zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $48, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm18 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm28 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm24 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm24, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm11[0],zmm17[0],zmm11[2],zmm17[2],zmm11[4],zmm17[4],zmm11[6],zmm17[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm21 -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm23, %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, 
%zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-FAST-NEXT: movb $120, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm0 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm16 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm29 {%k3} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} ; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k5 -; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k5} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k3} -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm1 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k5} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k5} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} ; AVX512DQ-FAST-NEXT: movb $-31, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm30 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm30 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm19 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm24 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} ; AVX512DQ-FAST-NEXT: movb $112, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm9, %zmm18 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm8, %zmm19 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm8, %zmm24 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 
192(%r9), %zmm9 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm5 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm5, %zmm7 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm28 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm4 {%k4} -; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm6 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} ; AVX512DQ-FAST-NEXT: movb $56, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm10 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm12 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm12[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-FAST-NEXT: movb $64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} ; AVX512DQ-FAST-NEXT: movb $8, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,12,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} 
zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) -; AVX512DQ-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 +; AVX512DQ-FAST-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -7534,443 +7526,445 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = 
[4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, (%rsp) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm17[0],ymm1[2],ymm17[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm20, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm16[0],ymm19[2],ymm16[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm16, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm16[0],ymm0[2],ymm16[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[2,3,2,3],zmm29[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm20, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm31, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm24, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm29, %zmm30, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm31, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm17, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm18, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm8[0],zmm10[0],zmm8[2],zmm10[2],zmm8[4],zmm10[4],zmm8[6],zmm10[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm8[0],zmm27[0],zmm8[2],zmm27[2],zmm8[4],zmm27[4],zmm8[6],zmm27[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, 
%zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm12[0],zmm9[0],zmm12[2],zmm9[2],zmm12[4],zmm9[4],zmm12[6],zmm9[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm11[0],zmm13[0],zmm11[2],zmm13[2],zmm11[4],zmm13[4],zmm11[6],zmm13[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm11[0],zmm25[0],zmm11[2],zmm25[2],zmm11[4],zmm25[4],zmm11[6],zmm25[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm14[0],zmm0[0],zmm14[2],zmm0[2],zmm14[4],zmm0[4],zmm14[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm19[0],zmm6[2],zmm19[2],zmm6[4],zmm19[4],zmm6[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm22[0],zmm0[0],zmm22[2],zmm0[2],zmm22[4],zmm0[4],zmm22[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm22, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm10, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm30 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm12 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm20, %zmm20 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm5 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm11 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm14[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = zmm16[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm12[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm26[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm4 -; 
AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm10 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm6 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm22[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm18, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm16, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm16[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm16 # 32-byte Folded Reload -; 
AVX512BW-ONLY-SLOW-NEXT: # ymm16 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm21, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; 
AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm25 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3]
 ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %cl
 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2}
 ; AVX512BW-ONLY-SLOW-NEXT: movb $8, %cl
 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2}
-; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3]
-; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13>
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,1,2,3,4,15,u,u>
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm14, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3]
+; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1}
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm13, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm13
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,12,u,3,4,5,6,7>
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm0, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <13,u,2,3,4,5,6,14>
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm6, %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,1,2,3,4,5,15,u>
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm15, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,12,3,4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm15
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm12, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm12
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm13, %zmm10
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,12,3,4,5,6,7]
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm14, %zmm13
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,13,2,3,4,5,6,7]
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm14
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm14
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm12
 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1472(%rax)
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1408(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax)
+; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm4, 1408(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512BW-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; 
AVX512BW-ONLY-SLOW-NEXT: retq @@ -7979,440 +7973,439 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm28, %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; 
AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm28, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm26[0],ymm20[2],ymm26[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm26, %ymm28, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm16, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm14[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: 
vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} 
= zmm3[2,3,2,3],zmm12[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm1, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm20, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm29, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 +; 
AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm25 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; 
AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm19 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} ; 
AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm28 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm15 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm15 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm9 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm10[0],mem[0],ymm10[2],mem[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm10[2,3,2,3],zmm8[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: movb $64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $8, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
<12,u,u,3,4,5,6,13>
-; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm10
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,1,2,3,4,15,u,u>
-; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm7
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7>
-; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm1
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14>
-; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm9, %zmm4
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,1,2,3,4,5,15,u>
-; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm7, %zmm9
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,12,3,4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm7
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7]
-; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm1
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15]
-; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm4
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3}
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,1,2,3,4,15,u,u>
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,1,2,3,4,5,15,u>
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7]
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15]
+; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1408(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1280(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1216(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1152(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax)
+; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax)
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 832(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 704(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 576(%rax)
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm11, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) ; AVX512BW-ONLY-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq @@ -8421,876 +8414,879 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 ; AVX512DQBW-SLOW-NEXT: movb $96, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm16, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm16, (%rsp) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm16[0],ymm1[2],ymm16[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: 
vmovdqa (%r9), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX512DQBW-SLOW-NEXT: movb $28, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm23 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,13,14,7,6,13,14,7] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm29 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm17[0],ymm19[2],ymm17[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k2} = zmm3[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm5 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm20, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[2,3,2,3],zmm20[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm3[2,3,2,3],zmm29[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q 
%zmm7, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm25, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm29, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm17, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $48, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm9[0],zmm12[0],zmm9[2],zmm12[2],zmm9[4],zmm12[4],zmm9[6],zmm12[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, 
%zmm4, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k3} = zmm11[0],zmm10[0],zmm11[2],zmm10[2],zmm11[4],zmm10[4],zmm11[6],zmm10[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm22, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm4, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm15[0],zmm10[2],zmm15[2],zmm10[4],zmm15[4],zmm10[6],zmm15[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm14, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm14[0],zmm31[0],zmm14[2],zmm31[2],zmm14[4],zmm31[4],zmm14[6],zmm31[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm4, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm22, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm10[0],zmm0[0],zmm10[2],zmm0[2],zmm10[4],zmm0[4],zmm10[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm8[0],zmm13[0],zmm8[2],zmm13[2],zmm8[4],zmm13[4],zmm8[6],zmm13[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm24[0],zmm31[0],zmm24[2],zmm31[2],zmm24[4],zmm31[4],zmm24[6],zmm31[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm2, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm22 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm22 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm18 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k3} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm27[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm5[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm2 = zmm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm25[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm29 {%k4} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm11 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = 
xmm6[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm14 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm14 {%k4} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm10 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm24 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm2 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm29 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm11 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm14 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm3, %zmm4 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm14 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm5, %zmm10 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm5, %zmm24 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd 
%esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm13 {%k4} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $56, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm20, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm21, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %ymm21 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm21[0],mem[0],ymm21[2],mem[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm21[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm16 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm16 = 
ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm27[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm16, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm22, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %ymm16 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm16 = ymm16[0],mem[0],ymm16[2],mem[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k2} = zmm16[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX512DQBW-SLOW-NEXT: movb $64, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} ; AVX512DQBW-SLOW-NEXT: movb $8, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm10, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm16, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm13, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm19, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm13, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm7, %zmm12 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm5, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm1, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 
1536(%rax) ; AVX512DQBW-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 +; AVX512DQBW-FAST-NEXT: subq $2056, %rsp # imm = 0x808 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm8 ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %ymm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm27, %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm29, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm0, %ymm27, %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm25[0],ymm20[2],ymm25[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm25, %ymm27, %ymm20 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm13[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; 
AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $48, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm18 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm28 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm24 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm19, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm24, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm11[0],zmm17[0],zmm11[2],zmm17[2],zmm11[4],zmm17[4],zmm11[6],zmm17[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm23, %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; 
AVX512DQBW-FAST-NEXT: movb $14, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQBW-FAST-NEXT: movb $120, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm0 = zmm25[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm16 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm29 {%k3} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} ; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k3} -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm1 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k5} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} ; AVX512DQBW-FAST-NEXT: movb $-31, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm30 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm30 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm18 {%k4} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 
$1, %xmm8, %ymm0, %ymm8 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm19 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm24 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} ; AVX512DQBW-FAST-NEXT: movb $112, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm9, %zmm18 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm8, %zmm19 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm8, %zmm24 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm9 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm5 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm5, %zmm7 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} +; 
AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm28 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm4 {%k4} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm6 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} ; AVX512DQBW-FAST-NEXT: movb $56, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm10 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm10 = 
zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm12 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm12[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQBW-FAST-NEXT: movb $64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} ; AVX512DQBW-FAST-NEXT: movb $8, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm11, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, 
%zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) -; AVX512DQBW-FAST-NEXT: addq $2024, %rsp # imm = 
0x7E8 +; AVX512DQBW-FAST-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -9985,38 +9981,38 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] ; SSE-NEXT: movapd 480(%rdx), %xmm11 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movapd 480(%rcx), %xmm7 -; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm7[0] +; SSE-NEXT: movapd 480(%rcx), %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; SSE-NEXT: movapd 480(%r8), %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movapd 480(%r9), %xmm5 -; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: movapd 496(%rdi), %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movapd 480(%r9), %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movapd 496(%rdi), %xmm5 ; SSE-NEXT: movapd 496(%rsi), %xmm4 -; SSE-NEXT: movapd %xmm6, %xmm8 -; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm4[0] ; SSE-NEXT: movapd 496(%rax), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm10[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm10[0],xmm5[1] ; SSE-NEXT: movapd 496(%rdx), %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movapd 496(%rcx), %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movapd 496(%r8), %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movapd 496(%rcx), %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movapd 496(%r8), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movapd 496(%r9), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 3568(%rax) -; SSE-NEXT: movapd %xmm1, 3552(%rax) +; SSE-NEXT: movapd %xmm2, 3552(%rax) ; SSE-NEXT: movapd %xmm4, 3536(%rax) -; SSE-NEXT: movapd %xmm6, 3520(%rax) -; SSE-NEXT: movapd %xmm2, 3504(%rax) +; SSE-NEXT: movapd %xmm5, 3520(%rax) +; SSE-NEXT: movapd %xmm1, 3504(%rax) ; SSE-NEXT: movapd %xmm3, 3488(%rax) -; SSE-NEXT: movapd %xmm8, 3472(%rax) -; SSE-NEXT: movapd %xmm5, 3456(%rax) -; SSE-NEXT: movapd %xmm7, 3440(%rax) +; SSE-NEXT: movapd %xmm7, 3472(%rax) +; SSE-NEXT: movapd %xmm6, 3456(%rax) +; SSE-NEXT: movapd %xmm8, 3440(%rax) ; SSE-NEXT: movapd %xmm12, 3424(%rax) ; SSE-NEXT: movapd %xmm15, 3408(%rax) ; SSE-NEXT: movapd %xmm9, 3392(%rax) @@ -10446,7 +10442,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3832, %rsp # imm = 0xEF8 +; AVX1-ONLY-NEXT: subq $3816, %rsp # imm = 0xEE8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10943,8 +10939,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; 
AVX1-ONLY-NEXT: vmovaps 384(%rax), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rax), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 @@ -10957,38 +10953,38 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 400(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%r9), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 384(%r9), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 400(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rcx), %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm5[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rcx), %ymm1, %ymm6 ; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm7 ; AVX1-ONLY-NEXT: vmovapd 416(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm6[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovapd 416(%rax), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -11004,8 +11000,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm2[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rcx), %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 @@ -11207,17 +11203,17 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm11[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -11225,107 +11221,107 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 
= ymm2[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm3, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3] +; 
AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 464(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0],ymm4[1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: vmovupd 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm11[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] @@ -11341,30 +11337,30 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = 
xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 2704(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 2704(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 2688(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 3152(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 3136(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 2256(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 3152(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 3136(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 2256(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 2240(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 1360(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 1344(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm10, 912(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 896(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 896(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm14, 1808(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 1792(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3520(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11573,13 +11569,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $3832, %rsp # imm = 0xEF8 +; AVX1-ONLY-NEXT: addq $3816, %rsp # imm = 0xEE8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3896, %rsp # imm = 0xF38 +; AVX2-ONLY-NEXT: subq $3880, %rsp # imm = 0xF28 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 @@ -11602,13 +11598,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -11647,11 +11643,11 @@ define void @store_i64_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11687,11 +11683,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11727,11 +11723,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11767,11 +11763,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11807,11 +11803,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11847,11 +11843,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11887,11 +11883,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups 
%ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11927,11 +11923,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 256(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11967,11 +11963,11 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 288(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11998,18 +11994,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm10 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12028,9 +12024,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm7 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -12044,11 +12040,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12075,18 +12070,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = 
ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12100,13 +12096,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm14 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -12138,9 +12134,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm11 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[0,1],ymm0[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 448(%rax), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] @@ -12165,9 +12161,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm12 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[0,1],ymm0[0,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 480(%rax), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] @@ -12192,10 +12188,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd (%rsp), %ymm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12335,79 +12331,78 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX2-ONLY-NEXT: vbroadcastsd %xmm10, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd %xmm15, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm12[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: 
vbroadcastsd 440(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm14 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm14 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm0 +; AVX2-ONLY-NEXT: 
vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm13 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm11, 3552(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 3520(%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm12, 3552(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 3520(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 3488(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3456(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12415,18 +12410,18 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3392(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 3360(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 3328(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 3328(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3296(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 3264(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 3232(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 3232(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3200(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3168(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3136(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 3104(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 3104(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3072(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12437,8 +12432,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2976(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2944(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 2912(%rcx) -; 
AVX2-ONLY-NEXT: vmovaps %ymm15, 2880(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 2912(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 2880(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2848(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12447,10 +12442,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2784(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2752(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 2720(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 2720(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2688(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 2656(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 2656(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2624(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12461,8 +12456,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2528(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 2464(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 2432(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 2464(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 2432(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2400(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12471,7 +12466,7 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2336(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2304(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 2272(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 2272(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2240(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12614,749 +12609,769 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-ONLY-NEXT: addq $3896, %rsp # imm = 0xF38 +; AVX2-ONLY-NEXT: addq $3880, %rsp # imm = 0xF28 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $6408, %rsp # imm = 0x1908 +; AVX512F-ONLY-SLOW-NEXT: subq $6600, %rsp # imm = 0x19C8 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
(%rcx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 
%zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512F-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm22, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, 
%zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm19, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, 
%zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm25[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm1[0],ymm25[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 
{{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = 
ymm4[0],ymm30[0],ymm4[2],ymm30[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm22, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k3 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: 
vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm8, %zmm3 
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm7, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm15[0],zmm12[0],zmm15[2],zmm12[2],zmm15[4],zmm12[4],zmm15[6],zmm12[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm8, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; 
AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm21[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: movb $4, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k5} ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,9,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k5} ; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, 
%zmm0, %zmm2 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4} ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,4,8,u,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,10,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; 
AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,9,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: movb $8, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k5} -; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, 
%zmm0, %zmm22 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} +; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4} 
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd 
{{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k2} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -13364,50 +13379,51 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte 
Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -13423,9 +13439,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -13433,22 +13450,22 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -13475,35 +13492,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) 
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2944(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2752(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2432(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) @@ -13513,9 +13530,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload @@ -13524,9 +13541,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm12, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13535,16 +13552,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3520(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3520(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13555,815 +13572,820 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $6408, %rsp # imm = 0x1908 +; AVX512F-ONLY-SLOW-NEXT: addq $6600, %rsp # imm = 0x19C8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 +; AVX512F-ONLY-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 ; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm27[0],ymm1[0],ymm27[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm6[0],ymm25[2],ymm6[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q 
%zmm4, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
128(%rcx), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm18[0],ymm14[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm10[0],ymm19[2],ymm10[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm22, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm27, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm18, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm4[2,3,2,3],zmm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q (%rsp), %ymm2, %ymm27 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm23 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm18, %ymm2, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm10, %ymm2, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm3, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm0 
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm17, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = 
[0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm29, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm29, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm17[0],zmm2[2],zmm17[2],zmm2[4],zmm17[4],zmm2[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm30 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm11 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm21, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm5, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm20, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm15 ; AVX512F-ONLY-FAST-NEXT: movb $4, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,10,u,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm14, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm9, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm14, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm26 ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] 
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $64, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k3} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,4,8,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm2, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,3,9,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm1, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, 
%zmm11 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k5} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 ; AVX512F-ONLY-FAST-NEXT: movb $8, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm29 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: movb $112, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm21, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm21 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm0, %zmm1 {%k1} @@ -14371,809 +14393,816 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $120, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $-61, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm9 = 
zmm9[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm23[0,1,2,3],zmm26[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2944(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2880(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm10, 2816(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm10, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2432(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 2368(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2304(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm12, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2240(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2112(%rax) ; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm10, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 1472(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 1024(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3520(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 3328(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3072(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 +; AVX512F-ONLY-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $6536, %rsp # imm = 0x1988 +; AVX512DQ-SLOW-NEXT: subq $6472, %rsp # imm = 0x1948 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: 
vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: movb $96, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512DQ-SLOW-NEXT: movb $28, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%r8), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r9), %ymm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r9), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm26[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r9), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm1[0],ymm28[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm14 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r9), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm1[2,3,2,3],zmm17[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm16 ; AVX512DQ-SLOW-NEXT: movb $48, %r10b ; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k3 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # 
ymm13 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm25[0],zmm0[0],zmm25[2],zmm0[2],zmm25[4],zmm0[4],zmm25[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 
%zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm31 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm24[0],zmm30[0],zmm24[2],zmm30[2],zmm24[4],zmm30[4],zmm24[6],zmm30[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm8[0],zmm12[2],zmm8[2],zmm12[4],zmm8[4],zmm12[6],zmm8[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm28, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm14 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm12[0],zmm13[0],zmm12[2],zmm13[2],zmm12[4],zmm13[4],zmm12[6],zmm13[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm16 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm3[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 -; AVX512DQ-SLOW-NEXT: movb $4, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm15 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload +; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 
448(%r8), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: movb $4, %sil +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k5} ; AVX512DQ-SLOW-NEXT: movb $6, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: 
vinserti64x4 $0, %ymm13, %zmm0, %zmm1 {%k3} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,9,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k5} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k5} ; AVX512DQ-SLOW-NEXT: movb $64, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm17 ; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,4,8,u,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm8, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, 
%zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,10,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r9), %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm6[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,9,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 ; AVX512DQ-SLOW-NEXT: movb $8, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k5} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k5} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k5} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k5} ; AVX512DQ-SLOW-NEXT: movb $-31, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 
%zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -15182,81 +15211,83 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4} ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k4} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} ; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm15, %zmm2 {%k2} -; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm1, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm12, %zmm6 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm28 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm9, %zmm22 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} ; AVX512DQ-SLOW-NEXT: 
vpbroadcastq 264(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k3} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k3} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3} ; AVX512DQ-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k3} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3} ; AVX512DQ-SLOW-NEXT: movb $56, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k2} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} @@ -15267,43 +15298,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 
+; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQ-SLOW-NEXT: movb $120, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm21 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm21 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 ; AVX512DQ-SLOW-NEXT: movb $-61, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7] @@ -15312,21 +15342,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm6 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm8 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] @@ -15334,44 +15364,43 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = 
ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] @@ -15379,33 +15408,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 3008(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2944(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 2752(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 2560(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2496(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 2304(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1792(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1600(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) @@ -15416,8 +15445,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 
1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15426,9 +15455,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15437,16 +15466,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3520(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3520(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15457,10 +15486,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQ-SLOW-NEXT: addq $6536, %rsp # imm = 0x1988 +; AVX512DQ-SLOW-NEXT: addq $6472, %rsp # imm = 0x1948 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -15468,797 +15497,804 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm13, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm27 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm29[0],ymm1[0],ymm29[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] ; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm8[0],ymm27[2],ymm8[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm23 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm28[0],ymm23[2],ymm28[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), 
%ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm24[0],ymm13[2],ymm24[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm11[0],ymm25[0],ymm11[2],ymm25[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%r9), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%r8), %ymm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQ-FAST-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%r9), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm12[0],ymm22[2],ymm12[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%r9), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm4[2,3,2,3],zmm10[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%r9), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, 
%zmm25, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vpermt2q (%rsp), %ymm2, %ymm29 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm27 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm28, %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm24, %ymm2, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm12, %ymm2, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm3, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 384(%r9), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm25, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $48, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 
%zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm30, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, 
%zmm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm30, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm25 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm17, %zmm20 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm29 -; AVX512DQ-FAST-NEXT: movb $4, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = 
<0,1,2,10,u,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm16, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,12,u,3,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm30, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm24 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm28 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm20, %zmm18, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm29 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm24 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm21 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm14 +; AVX512DQ-FAST-NEXT: movb $4, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q 
%zmm8, %zmm13, %zmm4
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1}
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13>
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm10
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm18
+; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8]
+; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm27
; AVX512DQ-FAST-NEXT: movb $24, %sil
; AVX512DQ-FAST-NEXT: kmovw %esi, %k4
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4}
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k4}
; AVX512DQ-FAST-NEXT: movb $6, %sil
; AVX512DQ-FAST-NEXT: kmovw %esi, %k5
-; AVX512DQ-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k5}
+; AVX512DQ-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm0
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5}
; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7>
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm17, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k4}
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4}
; AVX512DQ-FAST-NEXT: movb $64, %sil
; AVX512DQ-FAST-NEXT: kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3}
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,1,2,3,4,15,u,u>
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm11
-; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm14
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm3
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k3}
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,15,u,u>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm12
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm5
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm2
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm10
; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm16
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm17
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm16
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm13
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0
; AVX512DQ-FAST-NEXT: movb $12, %sil
; AVX512DQ-FAST-NEXT: kmovw %esi, %k3
-; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm2
-; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3}
-; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,4,8,u,7>
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,1,2,3,4,5,15,u>
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,9,u,6,7>
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm1, %zmm14
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14>
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm28
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
-; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5}
-; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5}
-; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5}
-; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5}
-; AVX512DQ-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5}
-; AVX512DQ-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5}
-; AVX512DQ-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5}
-; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7]
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7]
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm5
+; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0]
+; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3}
+; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm11, %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7>
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm7
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,1,2,3,4,5,15,u>
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7>
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm11
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12
+; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5}
+; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5}
+; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5}
+; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5}
+; AVX512DQ-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm20
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5}
+; AVX512DQ-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5}
+; AVX512DQ-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm1
+; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5}
+; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm5
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm18, %zmm1
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 ; AVX512DQ-FAST-NEXT: movb $8, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm26 {%k4} ; AVX512DQ-FAST-NEXT: movb $-31, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k3} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, 
%zmm0, %zmm5 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512DQ-FAST-NEXT: movb $112, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm24, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm21, %zmm31 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm9, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2} ; AVX512DQ-FAST-NEXT: movb $56, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 
= ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -16272,145 +16308,146 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQ-FAST-NEXT: movb $120, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded 
Reload -; AVX512DQ-FAST-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} ; AVX512DQ-FAST-NEXT: movb $-61, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm9 = 
zmm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm23[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 3008(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 2944(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2880(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm10, 2816(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm10, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 2944(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 2880(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2688(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2432(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm9, 2368(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2304(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 2368(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 2112(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1856(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 1024(%rax) +; 
AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 3520(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 3328(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 3072(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3136(%rax) ; AVX512DQ-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 @@ -16419,743 +16456,763 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $6408, %rsp # 
imm = 0x1908 +; AVX512BW-ONLY-SLOW-NEXT: subq $6600, %rsp # imm = 0x19C8 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm22, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
%zmm9, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm9[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm19, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r9), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm25[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm21[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm8 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm25[0],ymm1[0],ymm25[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r9), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm18, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r9), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm22, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q 
%zmm4, %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), 
%zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%r8), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm30[0],ymm4[2],ymm30[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm31, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm11 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r10b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k3} = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm29[0],zmm31[0],zmm29[2],zmm31[2],zmm29[4],zmm31[4],zmm29[6],zmm31[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm27 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm8, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm21[0],zmm22[0],zmm21[2],zmm22[2],zmm21[4],zmm22[4],zmm21[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm7, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm15[0],zmm12[0],zmm15[2],zmm12[2],zmm15[4],zmm12[4],zmm15[6],zmm12[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm24 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm8, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm21[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm9[0],zmm0[0],zmm9[2],zmm0[2],zmm9[4],zmm0[4],zmm9[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: movb $4, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,10,u,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,9,u,u,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, 
%zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm1, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,4,8,u,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,2,3,4,8,u,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,10,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 384(%rax), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,10,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r9), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%r8), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm4[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,8,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,9,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,9,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movb $8, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm0 -; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 128(%rax), %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 192(%rax), %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 256(%rax), %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 320(%rax), %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 384(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -17163,50 +17220,51 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -17222,9 +17280,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm7 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -17232,22 +17291,22 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm28 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] @@ -17274,35 +17333,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm20 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3008(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2944(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 2816(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2752(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2624(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2560(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2432(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 2304(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 
2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1984(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1856(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1792(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1536(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 1472(%rax) @@ -17312,9 +17371,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 1024(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload @@ -17323,9 +17382,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm1, 576(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17334,16 +17393,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; 
AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3520(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3520(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17354,815 +17413,820 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $6408, %rsp # imm = 0x1908 +; AVX512BW-ONLY-SLOW-NEXT: addq $6600, %rsp # imm = 0x19C8 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 +; AVX512BW-ONLY-FAST-NEXT: subq $6696, %rsp # imm = 0x1A28 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm27[0],ymm1[0],ymm27[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 
{{.*#+}} zmm15 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm6[0],ymm25[2],ymm6[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm26[0],ymm11[0],ymm26[2],ymm11[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r8), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm6[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
128(%r9), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm18[0],ymm14[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm5 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm10[0],ymm19[2],ymm10[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm22, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm12[0],ymm17[0],ymm12[2],ymm17[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm27, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm18, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm4[2,3,2,3],zmm15[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%r9), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm9[0],ymm22[2],ymm9[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r9), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = 
zmm10[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q (%rsp), %ymm2, %ymm27 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm25 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm23 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm18, %ymm2, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm10, %ymm2, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm3, %ymm2, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm26 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm25 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm25, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm17, %ymm0, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm0[0],zmm23[0],zmm0[2],zmm23[2],zmm0[4],zmm23[4],zmm0[6],zmm23[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm29, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm29, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm17[0],zmm2[2],zmm17[2],zmm2[4],zmm17[4],zmm2[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm2 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm30, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: 
vpermt2q %zmm7, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm20[0],zmm0[2],zmm20[2],zmm0[4],zmm20[4],zmm0[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 
-; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm16[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm21, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm5, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm20, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: movb $4, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,10,u,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm14, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, 
%zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm9, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,10,u,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm14, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm5, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm26 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 ; 
AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $64, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,4,8,u,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,2,3,4,8,u,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm2, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,1,2,3,9,u,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,9,u,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
<13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm27, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm6 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,9,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: movb $8, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, 
%zmm29 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm10 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm21, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 256(%rax), %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 320(%rax), %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 384(%rax), %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm21 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -18170,809 +18234,816 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $120, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] 
+; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm4[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: 
vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm23[0,1,2,3],zmm26[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = zmm10[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm13 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm29[0,1,2,3],zmm30[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2944(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2880(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm10, 2816(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm10, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3008(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 2944(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2880(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2816(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2688(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2560(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2432(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm9, 2368(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 2304(%rax) 
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2432(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm12, 2368(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2240(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2112(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm10, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm9, 1472(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 1024(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 
768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 576(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3520(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 3328(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3072(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 +; AVX512BW-ONLY-FAST-NEXT: addq $6696, %rsp # imm = 0x1A28 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $6536, %rsp # imm = 0x1988 +; AVX512DQBW-SLOW-NEXT: subq $6472, %rsp # imm = 0x1948 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: movb $96, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 
64(%r9), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX512DQBW-SLOW-NEXT: movb $28, %r10b +; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm3 
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), 
%zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r9), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r8), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r9), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm4[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] -; AVX512DQBW-SLOW-NEXT: movb $28, %r10b -; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm6[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm20, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 
-; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm22, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r9), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%r8), %ymm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm11[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm20, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm18, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} 
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r9), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[2,3,2,3],zmm26[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm20, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%r9), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm1[0],ymm28[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r9), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm1[2,3,2,3],zmm17[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm1 -; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm0 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm27, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm6, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm14 +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm16 ; AVX512DQBW-SLOW-NEXT: movb $48, %r10b ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k3 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm0[0],zmm3[0],zmm0[2],zmm3[2],zmm0[4],zmm3[4],zmm0[6],zmm3[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm25[0],zmm0[0],zmm25[2],zmm0[2],zmm25[4],zmm0[4],zmm25[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm29[0],zmm0[0],zmm29[2],zmm0[2],zmm29[4],zmm0[4],zmm29[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = 
zmm22[0],zmm23[0],zmm22[2],zmm23[2],zmm22[4],zmm23[4],zmm22[6],zmm23[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm9, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm24[0],zmm30[0],zmm24[2],zmm30[2],zmm24[4],zmm30[4],zmm24[6],zmm30[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm8[0],zmm12[2],zmm8[2],zmm12[4],zmm8[4],zmm12[6],zmm8[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm28[0],zmm21[0],zmm28[2],zmm21[2],zmm28[4],zmm21[4],zmm28[6],zmm21[6] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm17, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: 
vpermt2q %zmm21, %zmm8, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm7[0],zmm10[0],zmm7[2],zmm10[2],zmm7[4],zmm10[4],zmm7[6],zmm10[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm22, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm12[0],zmm13[0],zmm12[2],zmm13[2],zmm12[4],zmm13[4],zmm12[6],zmm13[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm7, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k3} = zmm5[0],zmm0[0],zmm5[2],zmm0[2],zmm5[4],zmm0[4],zmm5[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm3[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm21, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm8, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm0[0],zmm6[2],zmm0[2],zmm6[4],zmm0[4],zmm6[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512DQBW-SLOW-NEXT: movb $4, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k3} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,10,u,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm15 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $6, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,9,u,u,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 {%k5} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm1 {%k3} +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,2,9,u,u,6,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm6 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $64, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm1, %zmm17 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k4} ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,3,4,8,u,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,9,u,6,7> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm8, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,10,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r9), %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r8), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm6[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,9,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r9), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%r8), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm7[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm12, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm6 ; AVX512DQBW-SLOW-NEXT: movb $8, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k5} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k5} ; AVX512DQBW-SLOW-NEXT: movb $-31, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -18981,81 +19052,83 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm25 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k4} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm21 {%k4} ; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm15, %zmm2 {%k2} -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm19, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm1, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 128(%rax), %zmm10, %zmm4 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm12, %zmm6 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm28 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 192(%rax), %zmm1, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 256(%rax), %zmm1, %zmm30 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm9, %zmm22 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 320(%rax), %zmm1, %zmm25 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 384(%rax), %zmm3, %zmm21 {%k2} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 136(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm6 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 200(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 
64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 264(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm7 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 328(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm30 {%k3} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm26 {%k3} ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 392(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k3} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 {%k3} ; AVX512DQBW-SLOW-NEXT: movb $56, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} @@ -19066,43 +19139,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512DQBW-SLOW-NEXT: movb $120, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm21 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm21 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm15 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm23 ; AVX512DQBW-SLOW-NEXT: movb $-61, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm15 = 
zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm14 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # zmm11 = zmm1[0,1,2,3],mem[4,5,6,7] @@ -19111,21 +19183,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm6 = zmm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm5 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # zmm7 = zmm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm8 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] @@ -19133,44 +19205,43 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; 
AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] @@ -19178,33 +19249,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 3008(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2944(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 2880(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 2752(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 2624(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 2560(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 2496(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 2432(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 2304(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 2176(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 2112(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2048(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1984(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1792(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 
1600(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1536(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) @@ -19215,8 +19286,8 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1088(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19225,9 +19296,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 768(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19236,16 +19307,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3520(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3520(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19256,10 +19327,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 3072(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3136(%rax) -; AVX512DQBW-SLOW-NEXT: addq $6536, %rsp # imm = 
0x1988 +; AVX512DQBW-SLOW-NEXT: addq $6472, %rsp # imm = 0x1948 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; @@ -19267,797 +19338,804 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: subq $6568, %rsp # imm = 0x19A8 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm8 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm27 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm29[0],ymm1[0],ymm29[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 
-; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [6,13,14,7,6,13,14,7] ; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm8[0],ymm27[2],ymm8[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm30[0],ymm9[0],ymm30[2],ymm9[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm1[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, 
%zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm23 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm28[0],ymm23[2],ymm28[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm4, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm24[0],ymm0[0],ymm24[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm2[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %ymm24 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm24[0],ymm13[2],ymm24[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm5, %zmm18, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %ymm25 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm25[0],ymm11[2],ymm25[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm17, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r9), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r8), %ymm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm4[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%r9), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %ymm22 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm12[0],ymm22[2],ymm12[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm3[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm23, %zmm18, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm22, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r9), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r8), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm4[2,3,2,3],zmm10[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r9), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%r8), %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm10[2,3,2,3],zmm21[2,3,2,3] ; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm17, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vpermt2q (%rsp), %ymm2, %ymm29 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm27 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm28, %ymm2, %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm24, %ymm2, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm12, %ymm2, %ymm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm3, %ymm2, %ymm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r9), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r8), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm30 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 
%ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm24 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm25, %ymm0, %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm9, %ymm0, %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm4, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r9), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%r8), %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $48, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm26[0],zmm0[2],zmm26[2],zmm0[4],zmm26[4],zmm0[6],zmm26[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, 
%zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm4[0],zmm2[0],zmm4[2],zmm2[2],zmm4[4],zmm2[4],zmm4[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 
%zmm19, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm19[0],zmm2[2],zmm19[2],zmm2[4],zmm19[4],zmm2[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm30, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm18, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm15 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm3[0],zmm0[0],zmm3[2],zmm0[2],zmm3[4],zmm0[4],zmm3[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm0[0],zmm31[0],zmm0[2],zmm31[2],zmm0[4],zmm31[4],zmm0[6],zmm31[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k3} = zmm2[0],zmm16[0],zmm2[2],zmm16[2],zmm2[4],zmm16[4],zmm2[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm30, 
%zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm3[0],zmm19[0],zmm3[2],zmm19[2],zmm3[4],zmm19[4],zmm3[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm28 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k3} = zmm23[0],zmm20[0],zmm23[2],zmm20[2],zmm23[4],zmm20[4],zmm23[6],zmm20[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 
%zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm18, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm18, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm25 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 
%zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm17, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm18 {%k3} = zmm7[0],zmm5[0],zmm7[2],zmm5[2],zmm7[4],zmm5[4],zmm7[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm21, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm6[0],zmm5[0],zmm6[2],zmm5[2],zmm6[4],zmm5[4],zmm6[6],zmm5[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm6, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k3} = 
zmm12[0],zmm4[0],zmm12[2],zmm4[2],zmm12[4],zmm4[4],zmm12[6],zmm4[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm14 ; AVX512DQBW-FAST-NEXT: movb $4, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,2,10,u,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm16, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm24 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,10,u,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,12,u,3,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm10, %zmm18 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm23 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm27 ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k4} ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k5} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,9,u,u,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm17, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm12 {%k4} ; AVX512DQBW-FAST-NEXT: movb $64, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm14 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm17 +; AVX512DQBW-FAST-NEXT: 
vpermi2q %zmm2, %zmm5, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,4,8,u,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,9,u,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm1, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k5} -; AVX512DQBW-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,10,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,12,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-FAST-NEXT: 
vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,4,8,u,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm31 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 264(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 328(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k5} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 392(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,10,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k2} = zmm17[2,3,2,3],zmm1[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm12 ; AVX512DQBW-FAST-NEXT: movb $8, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 
%zmm1 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm26 {%k4} ; AVX512DQBW-FAST-NEXT: movb $-31, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm10 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm31 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm22 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm7 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm11 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512DQBW-FAST-NEXT: movb $112, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm2, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm2, %zmm1 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm0, %zmm13 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm24, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm21, %zmm31 {%k2} -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm9, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm0, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm0, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 256(%rax), %zmm23, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 320(%rax), %zmm27, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm27 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 384(%rax), %zmm10, %zmm25 {%k2} ; AVX512DQBW-FAST-NEXT: movb $56, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; 
AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm26 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm20 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm23 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm17 {%k2} +; 
AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm16 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm18 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k2} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -20071,145 +20149,146 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQBW-FAST-NEXT: movb $120, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; 
AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} ; AVX512DQBW-FAST-NEXT: movb $-61, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm9 = zmm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm23[0,1,2,3],zmm26[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm5 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm7 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm8 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm9 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm11 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 3008(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 2944(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 2880(%rax) -; 
AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm10, 2816(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 2752(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm10, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 3008(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 2944(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 2880(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2688(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 2560(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 2496(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2432(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm9, 2368(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2304(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 2432(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 2368(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2240(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2112(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 2112(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 2048(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1856(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; 
AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 3520(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 3520(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3456(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 3328(%rax) +; AVX512DQBW-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3328(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3264(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3200(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 3072(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3072(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3136(%rax) ; AVX512DQBW-FAST-NEXT: addq $6568, %rsp # imm = 0x19A8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index efb3a98dc68d7..af03a07a592e6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -154,7 +154,7 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm11 +; SSE-NEXT: movaps 16(%rsi), %xmm14 ; SSE-NEXT: movaps (%rdx), %xmm1 ; SSE-NEXT: movaps 16(%rdx), %xmm3 ; SSE-NEXT: movaps (%rcx), %xmm7 @@ -164,8 +164,8 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps (%r9), %xmm13 ; SSE-NEXT: movaps (%r10), %xmm6 ; SSE-NEXT: movaps 16(%r10), %xmm9 -; SSE-NEXT: movaps (%rax), %xmm14 -; SSE-NEXT: movaps 16(%rax), %xmm15 +; SSE-NEXT: movaps (%rax), %xmm15 +; SSE-NEXT: movaps 16(%rax), %xmm11 ; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -177,21 +177,21 @@ define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] ; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] -; SSE-NEXT: movaps 16(%r9), %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; SSE-NEXT: movaps 16(%r9), %xmm11 ; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps %xmm9, 240(%rax) @@ -199,8 +199,8 @@ 
define void @store_i64_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm13, 176(%rax) ; SSE-NEXT: movaps %xmm5, 96(%rax) ; SSE-NEXT: movaps %xmm6, 112(%rax) -; SSE-NEXT: movaps %xmm14, 32(%rax) -; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm15, 32(%rax) +; SSE-NEXT: movaps %xmm14, 48(%rax) ; SSE-NEXT: movaps %xmm2, 192(%rax) ; SSE-NEXT: movaps %xmm3, 208(%rax) ; SSE-NEXT: movaps %xmm12, 128(%rax) @@ -384,56 +384,56 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movaps (%rdi), %xmm3 -; SSE-NEXT: movaps 16(%rdi), %xmm5 -; SSE-NEXT: movaps (%rsi), %xmm1 -; SSE-NEXT: movaps 16(%rsi), %xmm13 -; SSE-NEXT: movaps (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm7 -; SSE-NEXT: movaps (%rcx), %xmm2 -; SSE-NEXT: movaps 16(%rcx), %xmm14 -; SSE-NEXT: movaps (%r8), %xmm6 -; SSE-NEXT: movaps 16(%r8), %xmm9 -; SSE-NEXT: movaps (%r9), %xmm11 -; SSE-NEXT: movaps 16(%r9), %xmm0 -; SSE-NEXT: movaps (%r10), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm9 +; SSE-NEXT: movaps (%rsi), %xmm3 +; SSE-NEXT: movaps 16(%rsi), %xmm0 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movaps 16(%rdx), %xmm11 +; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movaps 16(%rcx), %xmm1 +; SSE-NEXT: movaps (%r8), %xmm10 +; SSE-NEXT: movaps 16(%r8), %xmm13 +; SSE-NEXT: movaps (%r9), %xmm5 +; SSE-NEXT: movaps 16(%r9), %xmm2 +; SSE-NEXT: movaps (%r10), %xmm12 ; SSE-NEXT: movaps 16(%r10), %xmm15 -; SSE-NEXT: movaps (%rax), %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps (%rax), %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm3[0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm12[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm14[0] -; 
SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rax), %xmm0 ; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -441,61 +441,61 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm13 -; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: movaps 32(%rdx), %xmm14 +; SSE-NEXT: movaps 32(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps 32(%rdx), %xmm11 ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: movaps %xmm11, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 32(%r8), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 32(%r8), %xmm10 ; SSE-NEXT: movaps 32(%r9), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 32(%r10), %xmm10 -; SSE-NEXT: movaps 32(%rax), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 32(%r10), %xmm8 +; SSE-NEXT: movaps 
32(%rax), %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps 48(%rdi), %xmm6 -; SSE-NEXT: movaps 48(%rsi), %xmm2 +; SSE-NEXT: movaps 48(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: movaps 48(%rdx), %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 48(%rdx), %xmm5 ; SSE-NEXT: movaps 48(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 48(%r8), %xmm1 -; SSE-NEXT: movaps 48(%r9), %xmm3 +; SSE-NEXT: movaps 48(%r9), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 48(%r10), %xmm3 -; SSE-NEXT: movaps 48(%rax), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 48(%r10), %xmm2 +; SSE-NEXT: movaps 48(%rax), %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 496(%rax) +; SSE-NEXT: movaps %xmm2, 496(%rax) ; SSE-NEXT: movaps %xmm1, 480(%rax) -; SSE-NEXT: movaps %xmm2, 464(%rax) +; SSE-NEXT: movaps %xmm5, 464(%rax) ; SSE-NEXT: movaps %xmm6, 448(%rax) ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps %xmm4, 416(%rax) ; SSE-NEXT: movaps %xmm7, 400(%rax) ; SSE-NEXT: movaps %xmm9, 384(%rax) -; SSE-NEXT: movaps %xmm10, 368(%rax) -; SSE-NEXT: movaps %xmm8, 352(%rax) -; SSE-NEXT: movaps %xmm14, 336(%rax) +; SSE-NEXT: movaps %xmm8, 368(%rax) +; SSE-NEXT: movaps %xmm10, 352(%rax) +; SSE-NEXT: movaps %xmm11, 336(%rax) ; SSE-NEXT: movaps %xmm13, 320(%rax) -; SSE-NEXT: movaps %xmm11, 304(%rax) -; SSE-NEXT: movaps %xmm12, 288(%rax) +; SSE-NEXT: movaps %xmm12, 304(%rax) +; SSE-NEXT: movaps %xmm14, 288(%rax) ; SSE-NEXT: movaps %xmm15, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) @@ -611,36 +611,36 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm1[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm15[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 
48(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 416(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) @@ -665,102 +665,102 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm7 ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; 
AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm15[0],xmm14[0] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm12, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm5, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0] ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm11[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm15, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm15 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm13 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm8 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rdx) @@ -1194,74 +1194,74 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm15 +; SSE-NEXT: movaps 80(%r10), %xmm2 ; SSE-NEXT: movaps 80(%rax), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm11 -; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdx), %xmm13 -; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm13 +; SSE-NEXT: movaps 96(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 96(%r8), %xmm8 +; SSE-NEXT: movaps 96(%rdx), %xmm10 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 96(%r8), %xmm11 ; SSE-NEXT: movaps 96(%r9), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 96(%r10), %xmm9 ; SSE-NEXT: movaps 96(%rax), %xmm0 ; SSE-NEXT: movaps %xmm9, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm6 -; SSE-NEXT: movaps 112(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: movaps 112(%rdx), %xmm2 +; SSE-NEXT: movaps 112(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdx), %xmm5 ; SSE-NEXT: movaps 112(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 112(%r8), %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm3 +; SSE-NEXT: movaps 112(%r9), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 112(%r10), %xmm3 -; SSE-NEXT: movaps 112(%rax), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 112(%r10), %xmm2 +; SSE-NEXT: movaps 112(%rax), %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1008(%rax) +; SSE-NEXT: movaps %xmm2, 1008(%rax) ; SSE-NEXT: movaps %xmm1, 992(%rax) -; SSE-NEXT: movaps %xmm2, 976(%rax) -; SSE-NEXT: movaps %xmm6, 960(%rax) +; SSE-NEXT: movaps %xmm5, 976(%rax) +; SSE-NEXT: movaps %xmm7, 960(%rax) ; SSE-NEXT: movaps %xmm0, 944(%rax) ; SSE-NEXT: movaps %xmm4, 928(%rax) -; SSE-NEXT: movaps %xmm7, 912(%rax) -; SSE-NEXT: movaps %xmm10, 896(%rax) +; SSE-NEXT: movaps %xmm6, 912(%rax) +; SSE-NEXT: movaps %xmm8, 896(%rax) ; SSE-NEXT: movaps %xmm9, 880(%rax) -; SSE-NEXT: movaps %xmm8, 864(%rax) -; SSE-NEXT: movaps %xmm13, 848(%rax) -; SSE-NEXT: movaps %xmm11, 832(%rax) +; SSE-NEXT: movaps %xmm11, 864(%rax) +; SSE-NEXT: movaps %xmm10, 848(%rax) +; SSE-NEXT: movaps %xmm13, 832(%rax) ; SSE-NEXT: movaps %xmm12, 816(%rax) ; SSE-NEXT: movaps %xmm14, 800(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 784(%rax) +; SSE-NEXT: movaps %xmm15, 784(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rax) -; SSE-NEXT: movaps %xmm15, 752(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 736(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1492,50 +1492,50 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] ; 
AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] @@ -1545,17 +1545,17 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%rax), %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] @@ -1574,8 +1574,8 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 704(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 672(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 640(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 672(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 640(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1584,9 +1584,9 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 480(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1597,8 +1597,8 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 
192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1645,10 +1645,10 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -1662,22 +1662,22 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm6 @@ -1699,18 +1699,18 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 104(%r10), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, 
(%rdx), %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm15, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm15[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm13, %ymm13 ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] @@ -1726,15 +1726,15 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm11 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm7, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] @@ -1756,98 +1756,98 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: 
vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm7 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm8 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm9 +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rax), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm9 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rax), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rax), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm11 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 960(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 928(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 960(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 928(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 896(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 736(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 736(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 704(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 672(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 640(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 672(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 640(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 416(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2712,74 +2712,74 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r10), %xmm15 +; SSE-NEXT: movaps 208(%r10), %xmm2 ; SSE-NEXT: movaps 208(%rax), %xmm0 -; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps 224(%rdx), %xmm13 -; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm14 +; SSE-NEXT: movaps 224(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 224(%r8), %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdx), %xmm10 +; SSE-NEXT: movaps 224(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 224(%r8), %xmm12 ; SSE-NEXT: movaps 224(%r9), %xmm0 -; SSE-NEXT: movaps %xmm8, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movaps 224(%r10), %xmm9 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 224(%r10), %xmm8 ; SSE-NEXT: movaps 
224(%rax), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 240(%rdi), %xmm6 -; SSE-NEXT: movaps 240(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: movaps 240(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdx), %xmm6 ; SSE-NEXT: movaps 240(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps 240(%r8), %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm3 +; SSE-NEXT: movaps 240(%r9), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 240(%r10), %xmm3 -; SSE-NEXT: movaps 240(%rax), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps 240(%r10), %xmm2 +; SSE-NEXT: movaps 240(%rax), %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 2032(%rax) +; SSE-NEXT: movaps %xmm2, 2032(%rax) ; SSE-NEXT: movaps %xmm1, 2016(%rax) -; SSE-NEXT: movaps %xmm2, 2000(%rax) -; SSE-NEXT: movaps %xmm6, 1984(%rax) +; SSE-NEXT: movaps %xmm6, 2000(%rax) +; SSE-NEXT: movaps %xmm5, 1984(%rax) ; SSE-NEXT: movaps %xmm0, 1968(%rax) ; SSE-NEXT: movaps %xmm4, 1952(%rax) ; SSE-NEXT: movaps %xmm7, 1936(%rax) -; SSE-NEXT: movaps %xmm10, 1920(%rax) -; SSE-NEXT: movaps %xmm9, 1904(%rax) -; SSE-NEXT: movaps %xmm8, 1888(%rax) -; SSE-NEXT: movaps %xmm13, 1872(%rax) -; SSE-NEXT: movaps %xmm11, 1856(%rax) -; SSE-NEXT: movaps %xmm12, 1840(%rax) -; SSE-NEXT: movaps %xmm14, 1824(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1808(%rax) +; SSE-NEXT: movaps %xmm9, 1920(%rax) +; SSE-NEXT: movaps %xmm8, 1904(%rax) +; SSE-NEXT: movaps %xmm12, 1888(%rax) +; SSE-NEXT: movaps %xmm10, 1872(%rax) +; SSE-NEXT: movaps %xmm14, 1856(%rax) +; SSE-NEXT: movaps %xmm11, 1840(%rax) +; SSE-NEXT: movaps %xmm13, 1824(%rax) +; SSE-NEXT: movaps %xmm15, 1808(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1792(%rax) -; SSE-NEXT: movaps %xmm15, 1776(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1776(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1760(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3007,7 +3007,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX1-ONLY-LABEL: store_i64_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; 
AVX1-ONLY-NEXT: subq $1704, %rsp # imm = 0x6A8 +; AVX1-ONLY-NEXT: subq $1672, %rsp # imm = 0x688 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 @@ -3250,7 +3250,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3270,11 +3270,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -3313,8 +3313,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3358,7 +3357,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3371,67 +3370,66 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 208(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 208(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%r8), 
%xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 208(%rax), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%r9), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 240(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2016(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1984(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 1952(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 1984(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 
1952(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 1920(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%rdx) @@ -3441,9 +3439,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1824(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1792(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 1760(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 1728(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 1696(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 1760(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 1728(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1696(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3454,8 +3453,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1568(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1536(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 1504(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 1472(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 1504(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 1472(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1440(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3468,7 +3467,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1312(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1280(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 1248(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 1248(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1216(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%rdx) @@ -3482,8 +3481,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 992(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 960(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 992(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 960(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3510,9 +3509,8 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 480(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 448(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3525,7 +3523,7 @@ define void @store_i64_stride8_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) @@ -3541,7 +3539,7 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: addq $1704, %rsp # imm = 0x6A8 +; AVX1-ONLY-NEXT: addq $1672, %rsp # imm = 0x688 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3598,11 +3596,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 @@ -3610,11 +3608,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm1 @@ -3847,10 +3845,10 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = 
ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -3867,10 +3865,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -3910,11 +3909,11 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -3937,76 +3936,74 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 152(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rax), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: 
vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 184(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%rax), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm4 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rax), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r10), %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4014,54 +4011,55 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 2016(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1984(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1952(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 1920(%rdx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm2, 2016(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1984(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 1952(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1920(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 1760(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1664(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 1504(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1472(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 1728(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1696(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1664(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 1472(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1248(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1248(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 1216(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 992(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 960(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 992(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 960(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 736(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 704(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 736(%rdx) +; 
AVX2-ONLY-NEXT: vmovaps %ymm9, 704(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4142,3901 +4140,975 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf32: -; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # 
zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: 
vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, 
%ymm6, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; 
AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq +; AVX512F-LABEL: store_i64_stride8_vf32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm25 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512F-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm24 +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512F-NEXT: movb $-64, %r11b +; AVX512F-NEXT: kmovw %r11d, %k1 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 +; 
AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] +; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; 
AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm30 +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; 
AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm21 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm14 +; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 +; AVX512F-NEXT: vmovdqa64 
%zmm16, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 +; 
AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512F-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm5, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rsi), %xmm7 +; AVX512F-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 +; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX512F-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = 
ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512F-NEXT: vmovdqa64 (%rdx), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] +; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm20 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm12 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %ymm16 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm20 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-NEXT: vpunpckhqdq 
{{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm13 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] +; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm20 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm9 +; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512F-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 1984(%rax) +; AVX512F-NEXT: vmovaps %zmm10, 1920(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 1856(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1536(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 1472(%rax) +; AVX512F-NEXT: vmovaps %zmm19, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 1280(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 
768(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf32: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b -; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, 
%zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, 
%zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, 
%zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512F-ONLY-FAST-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: 
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm28
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm11
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm13
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23
-; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm2
-; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17
-; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7
-; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm14
-; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm11
-; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm17
-; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %ymm17
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm18
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm19
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm20
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
-; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
-; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm6
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %ymm18
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm19
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm14
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9
-; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm10
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm14
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
-; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10
-; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1664(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1216(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1152(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1984(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm11, 1920(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1856(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1472(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1344(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax)
-; AVX512F-ONLY-FAST-NEXT: addq $2568, %rsp # imm = 0xA08
-; AVX512F-ONLY-FAST-NEXT: vzeroupper
-; AVX512F-ONLY-FAST-NEXT: retq
-;
-; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf32:
-; AVX512DQ-SLOW: # %bb.0:
-; AVX512DQ-SLOW-NEXT: subq $2568, %rsp # imm = 0xA08
-; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdi), %zmm0
-; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3
-; AVX512DQ-SLOW-NEXT: vmovaps 192(%rdx), %zmm1
-; AVX512DQ-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdx), %zmm1
-; AVX512DQ-SLOW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm20
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm30
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm17
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm18
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm21
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22
-; AVX512DQ-SLOW-NEXT: movb $-64, %r11b
-; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1
-; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm7
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm7
-; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
-; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
-; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm7
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm9
-; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
-; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm7
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
-; AVX512DQ-SLOW-NEXT: # ymm28 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
-; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm6
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
-; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm4
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm7
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm3
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28
-; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16
-; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm11
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm24
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm24
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0
-; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm29
-; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm19
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm27
-; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
-; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm17
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm25
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm17
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm21
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm20
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm30
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm26, %zmm6
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm18
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm22
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm8
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6]
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm8
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm28
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm8
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm8
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm9
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6]
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7]
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm13
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6]
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23
-; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm0
-; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1
-; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6]
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7]
-; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3
-; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1}
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17
-; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1}
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7
-; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1}
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %xmm11
-; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm17
-; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19
-; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2]
-; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3]
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3]
-; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm17
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %ymm18
-; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2]
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3]
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm14
-; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18
-; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2]
-; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1}
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3]
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3]
-; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9
-; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm10
-; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2]
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16
-; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2]
-; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3]
-; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3]
-; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3]
-; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10
-; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1664(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1216(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1152(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 640(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1984(%rax)
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm11, 1920(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1856(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1536(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1408(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 1344(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1280(%rax)
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 512(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax)
-; AVX512DQ-SLOW-NEXT: addq $2568, %rsp # imm = 0xA08
-; AVX512DQ-SLOW-NEXT: vzeroupper
-; AVX512DQ-SLOW-NEXT: retq
-;
-; AVX512DQ-FAST-LABEL: store_i64_stride8_vf32:
-; AVX512DQ-FAST: # %bb.0:
-; AVX512DQ-FAST-NEXT: subq $2568, %rsp # imm = 0xA08
-; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FAST-NEXT: vmovaps 128(%rdi), %zmm0
-; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6
-; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23
-; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3
-; AVX512DQ-FAST-NEXT: vmovaps 192(%rdx), %zmm1
-; AVX512DQ-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovaps 128(%rdx), %zmm1
-; AVX512DQ-FAST-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8
-; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm20
-; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm19
-; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm30
-; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm15
-; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm17
-; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm18
-; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm21
-; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm22
-; AVX512DQ-FAST-NEXT: movb $-64, %r11b
-; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12]
-; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm7
-; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm7
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12]
-; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13]
-; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7
-; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13]
-; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14]
-; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm7
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm7
-; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm9
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14]
-; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15]
-; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm7
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm7
-; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7]
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15]
-; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12]
-; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0
-; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm7
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm2
-; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm6
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm1
-; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7]
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
-; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm28
-; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm16
-; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm11
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm24
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm24
-; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm23
-; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm5
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm13
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm14
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm31
-; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm29
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm29
-; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm12
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm19
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm19
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm27
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm27
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8]
-; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm1
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9]
-; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10]
-; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2
-; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11]
-; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
%zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = 
ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} 
-; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq -; -; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf32: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q 
%zmm5, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 128(%rax), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = 
zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 
{%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = 
ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq -; -; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf32: -; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: 
vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: 
vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512BW-ONLY-FAST-NEXT: vzeroupper -; AVX512BW-ONLY-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf32: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 64(%r10), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b -; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, 
%zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 
%zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 
{%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; 
AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm24, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq -; -; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf32: -; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovaps 192(%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 -; AVX512DQBW-FAST-NEXT: movb $-64, %r11b -; 
AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm30[0],zmm20[2],zmm30[2],zmm20[4],zmm30[4],zmm20[6],zmm30[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm30[1],zmm20[3],zmm30[3],zmm20[5],zmm30[5],zmm20[7],zmm30[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm17[0],zmm21[0],zmm17[2],zmm21[2],zmm17[4],zmm21[4],zmm17[6],zmm21[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm17[1],zmm21[1],zmm17[3],zmm21[3],zmm17[5],zmm21[5],zmm17[7],zmm21[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm28, 
%zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm15[0],zmm19[2],zmm15[2],zmm19[4],zmm15[4],zmm19[6],zmm15[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm15[1],zmm19[3],zmm15[3],zmm19[5],zmm15[5],zmm19[7],zmm15[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm18[0],zmm22[0],zmm18[2],zmm22[2],zmm18[4],zmm22[4],zmm18[6],zmm22[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm18[1],zmm22[1],zmm18[3],zmm22[3],zmm18[5],zmm22[5],zmm18[7],zmm22[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, 
%zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm31, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm27 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm4, %zmm1, %zmm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm16[0],zmm11[0],zmm16[2],zmm11[2],zmm16[4],zmm11[4],zmm16[6],zmm11[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm16[1],zmm11[1],zmm16[3],zmm11[3],zmm16[5],zmm11[5],zmm16[7],zmm11[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm23[0],zmm5[0],zmm23[2],zmm5[2],zmm23[4],zmm5[4],zmm23[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm23[1],zmm5[1],zmm23[3],zmm5[3],zmm23[5],zmm5[5],zmm23[7],zmm5[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm25, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm2, %ymm2 -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 64(%rdi), %xmm17 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 64(%rdx), %ymm17, %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm17[0],ymm2[0],ymm17[2],ymm2[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm17[1],ymm2[1],ymm17[3],ymm2[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm7, %ymm7 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm14, %ymm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm17 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm10, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm12, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm24, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm11, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm17 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm17, %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm17[0],ymm11[0],ymm17[2],ymm11[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm17[1],ymm11[1],ymm17[3],ymm11[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm5, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm17[0],ymm18[2],ymm17[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm19[0],ymm20[2],ymm19[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm18[1],ymm17[1],ymm18[3],ymm17[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm20[1],ymm19[1],ymm20[3],ymm19[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm17[0],ymm6[0],ymm17[2],ymm6[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm17[1],ymm6[1],ymm17[3],ymm6[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm14[2,3],ymm6[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm14[0],ymm17[2],ymm14[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm14[1],ymm17[3],ymm14[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm26, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm31, %zmm10 -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovaps %zmm11, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 
1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-FAST-NEXT: addq $2568, %rsp # imm = 0xA08 -; AVX512DQBW-FAST-NEXT: vzeroupper -; AVX512DQBW-FAST-NEXT: retq +; AVX512BW-LABEL: store_i64_stride8_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512BW-NEXT: movb $-64, %r11b +; AVX512BW-NEXT: kmovd %r11d, %k1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; 
AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] +; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 
%zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm30 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm21 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm14 +; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512BW-NEXT: 
vpermt2q %zmm18, %zmm2, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = 
zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm0 +; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm5, 
%zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm12 
+; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm16 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm13 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm9 +; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1984(%rax) +; AVX512BW-NEXT: vmovaps %zmm10, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 1856(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 1472(%rax) +; AVX512BW-NEXT: vmovaps %zmm19, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 
1280(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1088(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64 @@ -8896,74 +5968,74 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%r10), %xmm15 -; SSE-NEXT: movaps 464(%rax), %xmm1 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] -; SSE-NEXT: movaps 480(%rdi), %xmm11 -; SSE-NEXT: movaps 480(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps 480(%rdx), %xmm13 -; SSE-NEXT: movaps 480(%rcx), %xmm0 +; SSE-NEXT: movaps 464(%r10), %xmm2 +; SSE-NEXT: movaps 464(%rax), %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 480(%rdi), %xmm13 +; SSE-NEXT: movaps 480(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 480(%r8), %xmm10 +; SSE-NEXT: movaps 480(%rdx), %xmm10 +; 
SSE-NEXT: movaps 480(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movaps 480(%r8), %xmm9 ; SSE-NEXT: movaps 480(%r9), %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: movaps %xmm9, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 480(%r10), %xmm9 -; SSE-NEXT: movaps 480(%rax), %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm6 -; SSE-NEXT: movaps 496(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE-NEXT: movaps 496(%rdx), %xmm2 +; SSE-NEXT: movaps 480(%r10), %xmm11 +; SSE-NEXT: movaps 480(%rax), %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] +; SSE-NEXT: movaps 496(%rdi), %xmm7 +; SSE-NEXT: movaps 496(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 496(%rdx), %xmm5 ; SSE-NEXT: movaps 496(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 496(%r8), %xmm1 -; SSE-NEXT: movaps 496(%r9), %xmm3 +; SSE-NEXT: movaps 496(%r9), %xmm2 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movaps 496(%r10), %xmm3 -; SSE-NEXT: movaps 496(%rax), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; SSE-NEXT: movaps %xmm3, 4080(%rcx) +; SSE-NEXT: movaps 496(%r10), %xmm2 +; SSE-NEXT: movaps 496(%rax), %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, 4080(%rcx) ; SSE-NEXT: movaps %xmm1, 4064(%rcx) -; SSE-NEXT: movaps %xmm2, 4048(%rcx) -; SSE-NEXT: movaps %xmm6, 4032(%rcx) +; SSE-NEXT: movaps %xmm5, 4048(%rcx) +; SSE-NEXT: movaps %xmm7, 4032(%rcx) ; SSE-NEXT: movaps %xmm0, 4016(%rcx) ; SSE-NEXT: movaps %xmm4, 4000(%rcx) -; SSE-NEXT: movaps %xmm7, 3984(%rcx) +; SSE-NEXT: movaps %xmm6, 3984(%rcx) ; SSE-NEXT: movaps %xmm8, 3968(%rcx) -; SSE-NEXT: movaps %xmm9, 3952(%rcx) -; SSE-NEXT: movaps %xmm10, 3936(%rcx) -; SSE-NEXT: movaps %xmm13, 3920(%rcx) -; SSE-NEXT: movaps %xmm11, 3904(%rcx) +; SSE-NEXT: movaps %xmm11, 3952(%rcx) +; SSE-NEXT: movaps %xmm9, 3936(%rcx) +; SSE-NEXT: movaps %xmm10, 3920(%rcx) +; SSE-NEXT: movaps %xmm13, 3904(%rcx) ; SSE-NEXT: movaps %xmm12, 3888(%rcx) ; SSE-NEXT: movaps %xmm14, 3872(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 3856(%rcx) +; SSE-NEXT: movaps %xmm15, 3856(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload ; SSE-NEXT: movaps %xmm0, 3840(%rcx) -; SSE-NEXT: movaps %xmm15, 3824(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 3824(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 3808(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -9478,58 +6550,58 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd 
{{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -9914,10 +6986,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: 
vbroadcastsd 24(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -9934,11 +7006,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -9955,12 +7026,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -10219,43 +7288,40 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%r10), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 496(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps 496(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 496(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; 
AVX1-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%r10), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 496(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 496(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 496(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -10294,17 +7360,17 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10313,12 +7379,12 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm3, 4064(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 4032(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 4032(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 4000(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 3968(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10330,7 +7396,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 3840(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 3808(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 3776(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 3776(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 3744(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10371,8 +7437,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3104(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3072(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 3040(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 3008(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 3040(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 3008(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2976(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10399,8 +7465,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2592(%rdx) ; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2560(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 2528(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 2496(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 2528(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 2496(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2464(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10563,7 +7629,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX2-ONLY-LABEL: store_i64_stride8_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3912, %rsp # imm = 0xF48 +; AVX2-ONLY-NEXT: subq $3880, %rsp # imm = 0xF28 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 @@ -10614,11 +7680,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 @@ -10626,11 +7692,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm1 @@ -10698,11 +7764,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %xmm0 +; 
AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm1 @@ -10746,11 +7812,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 232(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm1 @@ -10783,9 +7849,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -10842,11 +7908,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 352(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 360(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 352(%rax), %xmm1 @@ -10866,11 +7932,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 384(%r9), %xmm1 ; 
AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 392(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm1 @@ -10885,15 +7951,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %xmm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm12 @@ -10905,13 +7971,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] ; AVX2-ONLY-NEXT: vbroadcastsd 456(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 448(%r8), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps 448(%r9), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 448(%r8), %xmm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vbroadcastsd 456(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 448(%rax), %xmm6 @@ -10923,13 +7989,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm5[1] ; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vmovaps 480(%r9), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 480(%r8), %xmm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 480(%r9), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 480(%r8), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vbroadcastsd 488(%r10), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 480(%rax), %xmm0 @@ -11062,8 +8128,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 288(%rdx), %ymm14, %ymm14 ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload @@ -11122,10 +8188,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdx), %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm13, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%r10), %ymm13, %ymm13 @@ -11134,20 +8199,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdx), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm7, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm9[0] +; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%r10), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdx), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm3[0] +; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%r10), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -11168,10 +8233,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -11188,10 +8253,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -11208,11 +8273,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -11229,8 +8293,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 
{{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -11278,7 +8341,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 184(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -11296,7 +8359,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%r10), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -11473,43 +8536,43 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vbroadcastsd 472(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%r10), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%r10), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 480(%r9), %ymm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%r10), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%r9), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%r10), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] @@ -11542,13 +8605,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11569,12 +8630,11 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm9, 4064(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 4032(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm9, 4000(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 4032(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 4000(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm9, 3968(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3808(%rdx) @@ -11607,14 +8667,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2720(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2688(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 2528(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 2496(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 2528(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 2496(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2464(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2432(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 2272(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 2240(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2240(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11629,13 +8690,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1760(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1664(%rdx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1472(%rdx) @@ -11811,657 +8872,643 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: 
vmovaps %ymm0, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: addq $3912, %rsp # imm = 0xF48 +; AVX2-ONLY-NEXT: addq $3880, %rsp # imm = 0xF28 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512F-ONLY-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = 
zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; 
AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = 
zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm2 -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q 
%zmm23, %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, 
%zmm15, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 
+; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, 
%zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 
%zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; 
AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 
64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -12469,261 +9516,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: 
vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = 
ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
448(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) @@ -12751,8 +9798,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12761,8 +9808,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12772,7 +9819,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12781,8 +9828,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12791,8 +9838,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12801,8 +9848,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12812,658 +9859,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512F-ONLY-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} 
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm18 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = 
zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = 
zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 192(%rdx), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = 
zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, 
%zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
384(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = 
zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; 
AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 
%zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 
%zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = 
zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -13471,261 +10504,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), 
%ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 
-; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] 
+; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, 
%zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; 
AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = 
ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 704(%rax) @@ -13753,8 +10786,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 
3328(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13763,8 +10796,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13774,7 +10807,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13783,8 +10816,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13793,8 +10826,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13803,8 +10836,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps 
%zmm0, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -13814,658 +10847,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512DQ-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512DQ-SLOW-NEXT: movb $-64, %r11b ; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q 
%zmm26, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 
{%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; 
AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), 
%zmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 
%zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), 
%zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = 
zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -14473,261 +11492,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 
{{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; 
AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQ-SLOW-NEXT: 
vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 
64(%rsi), %xmm12 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = 
ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, 
%ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm26, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 704(%rax) @@ -14755,8 +11774,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14765,8 +11784,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14776,7 +11795,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14785,8 +11804,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14795,8 +11814,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 
1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14805,8 +11824,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14816,658 +11835,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride8_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512DQ-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 (%r9), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512DQ-FAST-NEXT: movb $-64, %r11b ; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, 
%zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = 
zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: 
vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 
384(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 
-; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = 
zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = 
zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -15475,261 +12480,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 
64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; 
AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm13[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512DQ-FAST-NEXT: 
vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 
192(%rdx), %ymm23, %ymm23 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte 
Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) 
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 704(%rax) @@ -15757,8 +12762,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15767,8 +12772,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15778,7 +12783,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15787,8 +12792,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512DQ-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15797,8 +12802,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15807,8 +12812,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15818,658 +12823,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512BW-ONLY-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 (%rax), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm9 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: 
vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, 
%zmm14, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 
= ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, 
%zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: 
vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = 
zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; 
AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q 
%zmm10, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
%zmm22, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = 
ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -16477,261 +13468,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm9 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 
384(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa (%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 
64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 
64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) @@ -16759,8 +13750,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16769,8 +13760,8 @@ 
define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16780,7 +13771,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16789,8 +13780,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16799,8 +13790,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16809,8 +13800,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload @@ -16820,658 +13811,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512BW-ONLY-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = 
[6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm19, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: 
vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: 
vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 
%zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 
-; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, 
%zmm14, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -17479,261 +14456,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} 
ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; 
AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = 
ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = 
ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = 
ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, 
%zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 
3264(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 704(%rax) @@ -17761,8 +14738,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17771,8 +14748,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17782,7 +14759,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17791,8 +14768,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17801,8 +14778,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17811,8 +14788,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17822,658 +14799,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512DQBW-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b ; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; 
AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, 
%zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} 
ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 
= ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, 
%zmm6 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = 
zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = 
[0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = 
zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -18481,261 +15444,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: 
vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, 
%zmm5, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; 
AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; 
AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; 
AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = 
ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq 
{{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm31, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = 
ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 704(%rax) @@ -18763,8 +15726,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18773,8 +15736,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512DQBW-SLOW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18784,7 +15747,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18793,8 +15756,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18803,8 +15766,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18813,8 +15776,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18824,658 +15787,644 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $5576, %rsp # imm = 0x15C8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $5576, %rsp # imm = 0x15C8 +; AVX512DQBW-FAST-NEXT: subq $5512, %rsp # imm = 
0x1588 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 ; AVX512DQBW-FAST-NEXT: movb $-64, %r11b ; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm14[0],zmm1[0],zmm14[2],zmm1[2],zmm14[4],zmm1[4],zmm14[6],zmm1[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm14[1],zmm1[1],zmm14[3],zmm1[3],zmm14[5],zmm1[5],zmm14[7],zmm1[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512DQBW-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm20[0],zmm26[0],zmm20[2],zmm26[2],zmm20[4],zmm26[4],zmm20[6],zmm26[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm15, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm20[1],zmm26[1],zmm20[3],zmm26[3],zmm20[5],zmm26[5],zmm20[7],zmm26[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm19[0],zmm21[0],zmm19[2],zmm21[2],zmm19[4],zmm21[4],zmm19[6],zmm21[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm19[1],zmm21[1],zmm19[3],zmm21[3],zmm19[5],zmm21[5],zmm19[7],zmm21[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm24[0],zmm28[2],zmm24[2],zmm28[4],zmm24[4],zmm28[6],zmm24[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm28[1],zmm10[1],zmm28[3],zmm10[3],zmm28[5],zmm10[5],zmm28[7],zmm10[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm18[0],zmm16[2],zmm18[2],zmm16[4],zmm18[4],zmm16[6],zmm18[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm18[1],zmm16[3],zmm18[3],zmm16[5],zmm18[5],zmm16[7],zmm18[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm2[0],zmm24[0],zmm2[2],zmm24[2],zmm2[4],zmm24[4],zmm2[6],zmm24[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm24[1],zmm3[3],zmm24[3],zmm3[5],zmm24[5],zmm3[7],zmm24[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm5[0],zmm2[2],zmm5[2],zmm2[4],zmm5[4],zmm2[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm16[1],zmm9[3],zmm16[3],zmm9[5],zmm16[5],zmm9[7],zmm16[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm17[0],zmm3[2],zmm17[2],zmm3[4],zmm17[4],zmm3[6],zmm17[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm17[1],zmm3[3],zmm17[3],zmm3[5],zmm17[5],zmm3[7],zmm17[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 ; AVX512DQBW-FAST-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm3[0],zmm25[0],zmm3[2],zmm25[2],zmm3[4],zmm25[4],zmm3[6],zmm25[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm3[1],zmm25[1],zmm3[3],zmm25[3],zmm3[5],zmm25[5],zmm3[7],zmm25[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm6, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 
%zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm16 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm16[0],zmm22[2],zmm16[2],zmm22[4],zmm16[4],zmm22[6],zmm16[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm22[1],zmm16[1],zmm22[3],zmm16[3],zmm22[5],zmm16[5],zmm22[7],zmm16[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, 
%zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm23[0],zmm9[0],zmm23[2],zmm9[2],zmm23[4],zmm9[4],zmm23[6],zmm9[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm9[1],zmm23[3],zmm9[3],zmm23[5],zmm9[5],zmm23[7],zmm9[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 
384(%r9), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, 
%zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] ; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm25, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm14, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm10, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm10[0],zmm27[2],zmm10[2],zmm27[4],zmm10[4],zmm27[6],zmm10[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm10[1],zmm27[3],zmm10[3],zmm27[5],zmm10[5],zmm27[7],zmm10[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm11[0],zmm30[2],zmm11[2],zmm30[4],zmm11[4],zmm30[6],zmm11[6] 
+; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm30[1],zmm11[1],zmm30[3],zmm11[3],zmm30[5],zmm11[5],zmm30[7],zmm11[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm19[0],zmm12[0],zmm19[2],zmm12[2],zmm19[4],zmm12[4],zmm19[6],zmm12[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm12[1],zmm19[3],zmm12[3],zmm19[5],zmm12[5],zmm19[7],zmm12[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm19 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, 
%ymm1, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -19483,261 +16432,261 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; 
AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} ; 
AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm9 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; 
AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, 
%zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: 
vmovdqa 320(%rdx), %ymm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm18, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm7 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rdx), %ymm1, %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm12, %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm13, %ymm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm19, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm13, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 128(%rdx), %ymm18, %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm19 = 
ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm23, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 192(%rcx), %ymm18, %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm23 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 192(%rdx), %ymm23, %ymm23 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm23[0],ymm18[0],ymm23[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm23[1],ymm18[1],ymm23[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm31, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %xmm23 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 256(%rcx), %ymm23, %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm30, %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm23[0],ymm30[2],ymm23[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm16, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm23 = ymm30[1],ymm23[1],ymm30[3],ymm23[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm23, %zmm16, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 ; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %xmm28 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 320(%rcx), %ymm28, %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm30, %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm24, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm25, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm28, %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm30, %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm17, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm20, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm28, %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm30, %ymm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm31 = ymm30[0],ymm28[0],ymm30[2],ymm28[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm31, %zmm6, 
%zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm28 = ymm30[1],ymm28[1],ymm30[3],ymm28[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 3776(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 3712(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 3264(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 3200(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 2240(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1664(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1152(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 704(%rax) @@ -19765,8 +16714,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3392(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3008(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19775,8 +16724,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2880(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2496(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19786,7 +16735,7 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2304(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 2048(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1984(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19795,8 +16744,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1856(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1472(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19805,8 +16754,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19815,8 +16764,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -19826,8 +16775,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-FAST-NEXT: addq $5576, %rsp # imm = 
0x15C8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQBW-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index 729f2eb4b7997..06e94ccb92e69 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -595,7 +595,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa 32(%rdi), %xmm12 ; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa 32(%rsi), %xmm15 +; SSE-NEXT: movdqa 32(%rsi), %xmm14 ; SSE-NEXT: movdqa 48(%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rdx), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -626,7 +626,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] @@ -674,29 +674,29 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm10 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; SSE-NEXT: movdqa %xmm15, %xmm8 ; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,1,2] @@ -715,8 +715,8 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: pand %xmm1, %xmm4 ; 
SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm4 @@ -730,12 +730,12 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,1,2] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,1,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 @@ -763,13 +763,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm12 ; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,1,2] @@ -781,7 +781,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,6,6] ; SSE-NEXT: movdqa %xmm1, %xmm7 @@ -800,13 +800,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; SSE-NEXT: pandn %xmm9, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm15 
+; SSE-NEXT: por %xmm3, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,1,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] @@ -817,13 +817,13 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm14, 32(%rcx) +; SSE-NEXT: movdqa %xmm15, 32(%rcx) ; SSE-NEXT: movdqa %xmm7, 48(%rcx) ; SSE-NEXT: movdqa %xmm12, 80(%rcx) ; SSE-NEXT: movdqa %xmm5, 96(%rcx) @@ -840,119 +840,119 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: subq $24, %rsp ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm15 ; AVX1-ONLY-NEXT: vpshufb 
%xmm10, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: 
vpalignr {{.*#+}} xmm13 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb 
%xmm7, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, 80(%rcx) +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 176(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 176(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm13, 96(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 128(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 144(%rcx) -; AVX1-ONLY-NEXT: popq %rax +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 128(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 144(%rcx) +; AVX1-ONLY-NEXT: addq $24, %rsp ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride3_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll index 65028fbbb13f1..ea7a3a0920c24 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll @@ -517,70 +517,70 @@ define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = 
xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm6[8],xmm13[9],xmm6[9],xmm13[10],xmm6[10],xmm13[11],xmm6[11],xmm13[12],xmm6[12],xmm13[13],xmm6[13],xmm13[14],xmm6[14],xmm13[15],xmm6[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovdqa 
48(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 19dd55874c471..04b9dcb418cea 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -561,39 +561,38 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm8 +; SSE-NEXT: movdqa (%rcx), %xmm4 ; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] ; SSE-NEXT: movdqa %xmm10, %xmm12 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,1,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,1,0] -; SSE-NEXT: movdqa %xmm2, %xmm11 +; 
SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 ; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm11, %xmm10 ; SSE-NEXT: por %xmm5, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] @@ -607,8 +606,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] @@ -622,14 +621,14 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,6,7] ; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm11, %xmm0 @@ -638,8 +637,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm15, %xmm11 ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa %xmm7, %xmm15 @@ -650,13 +649,13 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; SSE-NEXT: pand %xmm3, %xmm0 @@ -670,13 +669,13 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; SSE-NEXT: pandn %xmm3, %xmm12 ; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,3] ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: por %xmm0, %xmm6 @@ -684,17 +683,17 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm12, %xmm13 ; SSE-NEXT: por %xmm6, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm13 +; SSE-NEXT: pand %xmm1, %xmm13 ; SSE-NEXT: por %xmm13, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -702,14 +701,14 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] ; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, 64(%r9) ; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movdqa %xmm15, 16(%r9) @@ -1085,22 +1084,21 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm12 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rsi), %xmm7 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm7 -; SSE-NEXT: movdqa 16(%r8), %xmm9 -; SSE-NEXT: movdqa 
%xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rcx), %xmm11 +; SSE-NEXT: movdqa 16(%rcx), %xmm12 +; SSE-NEXT: movdqa 16(%r8), %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] @@ -1110,10 +1108,10 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] ; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] @@ -1124,32 +1122,34 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,1,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2] +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm0 @@ -1159,13 +1159,14 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] @@ -1173,45 +1174,45 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,1,2,1] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,4] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; 
SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,1,2,3] +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm5 @@ -1219,32 +1220,33 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm7, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] ; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,4,7] ; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: 
pshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm4 @@ -1254,58 +1256,57 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm7, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm15 -; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: por %xmm7, %xmm14 +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm14, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] ; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,2] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 @@ -1315,27 +1316,27 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pand %xmm6, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: por %xmm14, %xmm15 ; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 @@ -1344,18 +1345,18 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw $164, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] ; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: por %xmm2, %xmm14 +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2] ; SSE-NEXT: movdqa %xmm3, %xmm5 @@ -1365,24 +1366,24 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm2, %xmm5 
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,3,3,3] -; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,2] -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm12, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pand %xmm6, %xmm10 ; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm5, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: pand %xmm9, %xmm2 @@ -1391,7 +1392,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] ; SSE-NEXT: pandn %xmm5, %xmm9 ; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,0,0] ; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,2,4,5,6,7] @@ -1406,9 +1407,9 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm8, %xmm3 ; SSE-NEXT: movdqa %xmm3, (%r9) -; SSE-NEXT: movdqa %xmm10, 64(%r9) +; SSE-NEXT: movdqa %xmm11, 64(%r9) ; SSE-NEXT: movdqa %xmm0, 80(%r9) -; SSE-NEXT: movdqa %xmm14, 144(%r9) +; SSE-NEXT: movdqa %xmm15, 144(%r9) ; SSE-NEXT: movdqa %xmm7, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) @@ -1425,151 +1426,148 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = 
[255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4],zero,xmm3[6,7,8,9],zero,xmm3[11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm1[9],zero,zero,zero,zero,xmm1[10],zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm15[9],zero,zero,zero,zero,xmm15[10],zero,zero,zero,zero,xmm15[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm12, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3],zero,xmm10[5,6,7,8],zero,xmm10[10,11,12,13],zero,xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,xmm1[2],zero -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} 
ymm13 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm13, %ymm14 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3],zero,xmm14[5,6,7,8],zero,xmm14[10,11,12,13],zero,xmm14[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm15[0],zero,zero,zero,zero,xmm15[1],zero,zero,zero,zero,xmm15[2],zero +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[1,2,3,4],zero,xmm5[6,7,8,9],zero,xmm5[11,12,13,14],zero ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6],zero,xmm5[u,u,u,7],zero,xmm5[u,u,u,8],zero,xmm5[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u],zero,xmm10[7,u,u,u],zero,xmm10[8,u,u,u],zero,xmm10[9,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 -; 
AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,xmm1[5,6,7,8],zero,xmm1[10,11,12,13],zero,xmm1[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[0,1,2,3],zero,xmm9[5,6,7,8],zero,xmm9[10,11,12,13],zero,xmm9[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm15[6,u,u,u],zero,xmm15[7,u,u,u],zero,xmm15[8,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm7[6,u,u,u],zero,xmm7[7,u,u,u],zero,xmm7[8,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm4, 
%xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2227,30 +2225,31 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-LABEL: store_i8_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $504, %rsp # imm = 0x1F8 -; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm7 ; SSE-NEXT: movdqa (%rsi), %xmm9 ; SSE-NEXT: movdqa 16(%rsi), %xmm14 -; SSE-NEXT: movdqa (%rdx), %xmm6 -; SSE-NEXT: movdqa 16(%rdx), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rcx), %xmm10 -; SSE-NEXT: movdqa 16(%rcx), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa 16(%rcx), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; SSE-NEXT: 
pand %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,1] @@ -2261,24 +2260,24 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] @@ -2293,16 +2292,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa 16(%r8), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rcx), %xmm0 @@ -2311,16 +2310,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 32(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 32(%rsi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,1] +; SSE-NEXT: movdqa 32(%rsi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] @@ -2332,16 +2331,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa 32(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rcx), %xmm0 @@ -2350,13 +2349,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa 48(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2372,36 +2371,36 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa 48(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm0 +; 
SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm9, %xmm1 @@ -2412,11 +2411,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,1,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1] ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm1 @@ -2429,13 +2428,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,7] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm5, %xmm9 @@ -2446,219 +2445,215 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm5 ; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] +; SSE-NEXT: 
movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] ; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,1,2,1] +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[2,2,3,3] ; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 
16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,3,3] ; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; 
SSE-NEXT: pand %xmm12, %xmm6 @@ -2669,84 +2664,84 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,0,0] +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,3,2] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,3,3,3] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] ; SSE-NEXT: movdqa {{.*#+}} 
xmm9 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; SSE-NEXT: movdqa %xmm9, %xmm13 ; SSE-NEXT: pandn %xmm5, %xmm13 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm8, %xmm13 +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm13 ; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] ; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] ; SSE-NEXT: pand %xmm12, %xmm6 ; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm5, %xmm10 ; SSE-NEXT: pand %xmm2, %xmm3 @@ -2756,53 +2751,52 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; SSE-NEXT: pand %xmm15, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: pand %xmm7, %xmm6 ; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 ; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; 
SSE-NEXT: # xmm3 = mem[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm14[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3] ; SSE-NEXT: movdqa %xmm12, %xmm11 ; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] ; SSE-NEXT: pand %xmm12, %xmm6 ; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: pand %xmm8, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: pandn %xmm3, %xmm14 ; SSE-NEXT: pand %xmm2, %xmm11 @@ -2812,24 +2806,24 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2] -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] -; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; SSE-NEXT: pand %xmm15, %xmm6 ; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: pand %xmm9, %xmm11 @@ -2851,15 +2845,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,1,3] ; SSE-NEXT: pandn %xmm11, %xmm12 ; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: pand %xmm2, %xmm1 @@ -2869,27 +2863,26 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] ; SSE-NEXT: pandn %xmm1, %xmm9 ; SSE-NEXT: por %xmm2, %xmm9 ; SSE-NEXT: movdqa %xmm9, 304(%r9) ; SSE-NEXT: movdqa %xmm0, 240(%r9) ; SSE-NEXT: movdqa %xmm6, 224(%r9) ; SSE-NEXT: movdqa %xmm14, 160(%r9) -; SSE-NEXT: movdqa %xmm8, 144(%r9) +; SSE-NEXT: movdqa %xmm7, 144(%r9) ; SSE-NEXT: movdqa %xmm10, 80(%r9) ; SSE-NEXT: movdqa %xmm13, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2925,276 +2918,272 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $104, %rsp ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 
-; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[3,4,5,6],zero,xmm3[8,9,10,11],zero,xmm3[13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm9[6],zero,zero,zero,zero,xmm9[7],zero,zero,zero,zero,xmm9[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[3],zero,zero,zero,zero,xmm9[4],zero,zero,zero,zero,xmm9[5],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = 
[255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1],zero,xmm6[3,4,5,6],zero,xmm6[8,9,10,11],zero,xmm6[13,14,15] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm7[6],zero,zero,zero,zero,xmm7[7],zero,zero,zero,zero,xmm7[8],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7],zero,xmm4[9,10,11,12],zero,xmm4[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm7[3],zero,zero,zero,zero,xmm7[4],zero,zero,zero,zero,xmm7[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} 
xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[12],zero,zero,zero,zero,xmm9[13],zero,zero,zero,zero,xmm9[14],zero,zero,zero,zero,xmm9[15] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[12],zero,zero,zero,zero,xmm7[13],zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,xmm7[15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm14 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 
-; AVX1-ONLY-NEXT: vandps %ymm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1],zero,xmm8[3,4,5,6],zero,xmm8[8,9,10,11],zero,xmm8[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm3[6],zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,xmm3[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm15 
+; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[0,1],zero,xmm14[3,4,5,6],zero,xmm14[8,9,10,11],zero,xmm14[13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm6[6],zero,zero,zero,zero,xmm6[7],zero,zero,zero,zero,xmm6[8],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[3],zero,zero,zero,zero,xmm3[4],zero,zero,zero,zero,xmm3[5],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3],zero,zero,zero,zero,xmm6[4],zero,zero,zero,zero,xmm6[5],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: 
vmovdqa %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5],zero,xmm3[7,8,9,10],zero,xmm3[12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm11 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm12 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 ; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3],zero,xmm13[5,6,7,8],zero,xmm13[10,11,12,13],zero,xmm13[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[12],zero,zero,zero,zero,xmm11[13],zero,zero,zero,zero,xmm11[14],zero,zero,zero,zero,xmm11[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[12],zero,zero,zero,zero,xmm13[13],zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,xmm13[15] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: 
vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero,xmm8[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9],zero,xmm4[u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9],zero,xmm6[u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = 
[255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3],zero,xmm2[5,6,7,8],zero,xmm2[10,11,12,13],zero,xmm2[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm11[0],zero,zero,zero,zero,xmm11[1],zero,zero,zero,zero,xmm11[2],zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9],zero,xmm6[u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero,xmm13[2],zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6],zero,xmm7[u,u,u,7],zero,xmm7[u,u,u,8],zero,xmm7[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 @@ -3202,22 +3191,22 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm1, 
%ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm11[9],zero,zero,zero,zero,xmm11[10],zero,zero,zero,zero,xmm11[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm13[9],zero,zero,zero,zero,xmm13[10],zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3252,18 +3241,18 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-SLOW-LABEL: store_i8_stride5_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $184, %rsp -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-SLOW-NEXT: subq $312, %rsp # imm = 0x138 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 @@ -3272,208 +3261,217 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm7 -; AVX2-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = 
<128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm8 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] ; 
AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,2,1,1,4,6,5,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,2,1,1,4,6,5,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, 
%ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm10 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[0,2,1,1,4,6,5,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm9, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,2,1,1,4,6,5,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,u,4,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4> +; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm15 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm15, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm11 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb 
%ymm11, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,3,u,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm15 -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = 
xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd $80, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm6 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm14[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] -; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, 
%ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd $80, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = 
ymm5[2,2,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 128(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%r9) +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 256(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 288(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: addq $184, %rsp +; AVX2-SLOW-NEXT: addq $312, %rsp # imm = 0x138 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $168, %rsp -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm11 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 @@ -3514,7 +3512,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3523,125 +3521,126 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vpshufb 
%ymm10, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,2,3,3] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [6,6,6,6,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm9, %ymm15, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm7, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm15 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: 
vbroadcasti128 {{.*#+}} ymm9 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm13, %ymm15, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm15, %ymm14, %ymm14 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm14, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = 
[0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] ; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vpshufb 
%xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] @@ -3654,23 +3653,23 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 256(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 256(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FAST-NEXT: addq $168, %rsp @@ -3680,9 +3679,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-LABEL: store_i8_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> @@ -3708,7 +3707,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] @@ -3724,7 +3723,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 @@ -3735,162 +3734,162 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,3,3,6,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm15, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm12, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; 
AVX2-FAST-PERLANE-NEXT: vpor %ymm14, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm9, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm10, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 
= [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd $80, (%rsp), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%r9) ; AVX2-FAST-PERLANE-NEXT: 
vmovdqa %ymm10, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp @@ -3899,368 +3898,372 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512F-SLOW-LABEL: store_i8_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm5, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 ; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm10 -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm2 -; AVX512F-SLOW-NEXT: 
vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm10 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-SLOW-NEXT: vporq %xmm12, %xmm13, %xmm20 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm22 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm23 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,ymm7[26],zero,ymm7[28],zero,ymm7[30],zero,zero,ymm7[29],zero,ymm7[31],zero,zero -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm15, %ymm1, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vporq %ymm1, %ymm9, %ymm24 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512F-SLOW-NEXT: vporq %xmm3, %xmm12, %xmm21 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm22 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero +; 
AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm23 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm20 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm13 -; AVX512F-SLOW-NEXT: vporq %xmm1, %xmm13, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm6 +; AVX512F-SLOW-NEXT: vporq %xmm1, %xmm6, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX512F-SLOW-NEXT: vporq %xmm13, %xmm5, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 32(%r8), %xmm29 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm29[1,1,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm26, %ymm27, %ymm26 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm26, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm27 = ymm4[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,3,2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm27, %ymm30, %ymm27 
-; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm13, %zmm27 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm12 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm14[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm7[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero,ymm11[25],zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm11[26],zero,ymm11[28],zero,zero,ymm11[27],zero,ymm11[29],zero,ymm11[31],zero,zero,ymm11[30],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm15 +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm15, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 32(%r8), %xmm25 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[1,1,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandnq %ymm15, %ymm26, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = 
[12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,3,2] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-SLOW-NEXT: vpandnq %ymm14, %ymm27, %ymm14 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm27 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm14 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm12 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm11[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm25, %zmm0, %zmm25 
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm2, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm20[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm29, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,0,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm21[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm3 -; AVX512F-SLOW-NEXT: vpor %ymm13, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm12, %ymm14, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm24, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm22[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm16, %zmm6 +; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm14, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm19, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm7, %ymm13, 
%ymm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm16, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm22[2,2,3,3,6,6,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm12, %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vpandq %ymm12, %ymm31, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm30, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm18[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vporq %zmm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm2, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm26 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm9[2,3,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm27 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm29[0,0,1,1] +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm7, %ymm1, %ymm12 +; AVX512F-SLOW-NEXT: vpandq %ymm7, %ymm30, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm31, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm24[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm3, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm26 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[2,3,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm27 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm15[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm25[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; 
AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm25[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm28[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 256(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 192(%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 ; AVX512F-FAST-NEXT: vpor 
%xmm0, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm9 -; AVX512F-FAST-NEXT: vpor %ymm8, %ymm9, %ymm7 -; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm27 -; AVX512F-FAST-NEXT: vporq %xmm12, %xmm13, %xmm20 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm22 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm23 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm18 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm4 +; AVX512F-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm4 +; AVX512F-FAST-NEXT: vporq %xmm3, %xmm4, %xmm19 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm6[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm22 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm23 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[27],zero,zero,ymm8[26],zero,ymm8[28],zero,ymm8[30],zero,zero,ymm8[29],zero,ymm8[31],zero,zero +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vporq %ymm13, %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vporq %ymm1, %ymm10, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX512F-FAST-NEXT: vporq %xmm1, %xmm15, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm15 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX512F-FAST-NEXT: vporq %xmm15, %xmm5, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,1,2,2,2,2,2,2] -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm27, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <4,u,5,5,5,5,u,6,30,30,30,u,31,31,31,31> -; AVX512F-FAST-NEXT: vpermi2d %zmm31, %zmm5, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = <4,u,5,5,5,5,u,6> -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm28, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm28, %ymm29, %ymm28 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 
$1, %ymm28, %zmm3, %zmm28 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[19],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vporq %xmm0, %xmm2, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm14 +; AVX512F-FAST-NEXT: vporq %xmm0, %xmm14, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [1,1,2,2,2,2,2,2] +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm14, %ymm25, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <4,u,5,5,5,5,u,6,30,30,30,u,31,31,31,31> +; AVX512F-FAST-NEXT: vpermi2d %zmm25, %zmm0, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <4,u,5,5,5,5,u,6> +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm26, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm26 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] ; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm12 -; 
AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,ymm7[26],zero,ymm7[28],zero,ymm7[30],zero,zero,ymm7[29],zero,ymm7[31],zero,zero -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm14[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero,ymm11[25],zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm11[26],zero,ymm11[28],zero,zero,ymm11[27],zero,ymm11[29],zero,ymm11[31],zero,zero,ymm11[30],zero -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm9[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm25, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm31, %zmm5 -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm20[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm28, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm29, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm0 +; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm7 = mem[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; 
AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm16, %zmm10 -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm6, %ymm14, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm24, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm22[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm6, %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vpandq %ymm6, %ymm29, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm18[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm12, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm30 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm4[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 192(%r9) +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm13 +; AVX512F-FAST-NEXT: vpor %ymm15, %ymm12, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm20, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm12, %ymm11, %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm21, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm22[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm23[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm11 +; AVX512F-FAST-NEXT: vpternlogq $248, %ymm12, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vpandq %ymm12, %ymm30, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 +; 
AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm24[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vporq %zmm7, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm7, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm27 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm31 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 192(%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -4283,55 +4286,55 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm17 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm3, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] ; AVX512BW-ONLY-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm16 ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm2 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm24, %ymm19 ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = 
ymm6[0,0,1,1] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 @@ -4340,36 +4343,36 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX512BW-ONLY-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4> -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm16, %ymm14, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512BW-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm6 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, 
%ymm25, %ymm28 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm17 ; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm17 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm23, %ymm26 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] @@ -4382,75 +4385,75 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm25, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm16[0,2,1,1,4,6,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,3,2] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm23 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %xmm16 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512BW-ONLY-SLOW-NEXT: 
vpshufb %xmm7, %xmm15, %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm23, %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm16[0],xmm23[0],xmm16[1],xmm23[1],xmm16[2],xmm23[2],xmm16[3],xmm23[3],xmm16[4],xmm23[4],xmm16[5],xmm23[5],xmm16[6],xmm23[6],xmm16[7],xmm23[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm16, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm20, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm21, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm16, %xmm11 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm11, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5] ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm7[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm1, %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vporq 
%ymm7, %ymm16, %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm4, %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} ; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,1,1,4,6,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,3,2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; @@ -4628,55 +4631,55 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm17 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm3, %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] ; AVX512DQBW-SLOW-NEXT: 
movl $1251232404, %eax # imm = 0x4A944A94 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm2 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 ; AVX512DQBW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm2 {%k4} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm16 ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512DQBW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm2 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm24, %ymm19 ; AVX512DQBW-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> 
+; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 ; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 ; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 @@ -4685,36 +4688,36 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 ; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX512DQBW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2} +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k3} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 ; AVX512DQBW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3} +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k2} ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4> -; AVX512DQBW-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermd %ymm16, %ymm14, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 ; AVX512DQBW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512DQBW-SLOW-NEXT: # ymm17 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm6 {%k6} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm18 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm17 ; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm17 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm23, %ymm26 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] @@ -4727,75 +4730,75 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm16[0,2,1,1,4,6,5,5] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,3,2] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm23 ; AVX512DQBW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm16 ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20 -; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; 
AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20 -; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm23, %xmm22 +; AVX512DQBW-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm16[0],xmm23[0],xmm16[1],xmm23[1],xmm16[2],xmm23[2],xmm16[3],xmm23[3],xmm16[4],xmm23[4],xmm16[5],xmm23[5],xmm16[6],xmm23[6],xmm16[7],xmm23[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm16, %xmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm16 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm20, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm21 +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm21, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm16, %xmm11 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5] ; AVX512DQBW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm7[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,0,1] ; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 ; AVX512DQBW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm1, %ymm7 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm16 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vporq %ymm7, %ymm16, %ymm7 ; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; 
AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm4, %ymm4 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} ; AVX512DQBW-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,1,1,4,6,5,5] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,3,2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index aae310a41fa01..0358af331d87a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -240,60 +240,61 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,3] +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm8, 32(%rax) +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm5, 32(%rax) ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm10, (%rax) +; SSE-NEXT: movdqa %xmm6, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride6_vf8: @@ -443,13 +444,13 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm12 ; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm13 ; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa (%r8), %xmm11 +; SSE-NEXT: movdqa (%r9), %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] @@ -464,52 +465,52 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; 
SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,2,2] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,2,2] ; SSE-NEXT: pand %xmm1, %xmm8 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,2,2] ; SSE-NEXT: movdqa %xmm8, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm10, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm12[8],xmm2[9],xmm12[9],xmm2[10],xmm12[10],xmm2[11],xmm12[11],xmm2[12],xmm12[12],xmm2[13],xmm12[13],xmm2[14],xmm12[14],xmm2[15],xmm12[15] +; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm15, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[3,3,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[3,3,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,3] ; SSE-NEXT: pand %xmm1, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[2,2,3,3] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm15, %xmm5 ; SSE-NEXT: pand %xmm4, %xmm14 @@ -524,18 +525,18 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 ; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm5, %xmm11 +; SSE-NEXT: pand %xmm3, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -549,8 +550,8 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: pandn %xmm0, %xmm14 ; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] @@ -563,16 +564,16 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,2] +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, 16(%rax) +; SSE-NEXT: movdqa %xmm9, 16(%rax) ; SSE-NEXT: movdqa %xmm14, 32(%rax) ; SSE-NEXT: movdqa %xmm3, 48(%rax) ; SSE-NEXT: movdqa %xmm15, 80(%rax) -; SSE-NEXT: movdqa %xmm12, 64(%rax) +; SSE-NEXT: movdqa %xmm10, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq @@ -802,34 +803,34 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; 
SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -838,192 +839,190 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm6 ; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm12 +; SSE-NEXT: movdqa (%rsi), %xmm14 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa (%rdx), %xmm14 -; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa (%rdx), %xmm11 +; SSE-NEXT: movdqa (%rcx), %xmm12 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,7,7] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa (%r8), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm6, %xmm15 ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE-NEXT: movdqa (%r9), %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 ; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm15 ; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,0,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] ; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm4, %xmm11 ; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,7,7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pand %xmm5, %xmm8 +; SSE-NEXT: por %xmm8, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 
= xmm14[1,1,2,2] ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,0,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm8 ; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: pshuflw $161, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,0,2,2,4,5,6,7] @@ -1034,44 +1033,44 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd $0, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: pshufd $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm7, %xmm13 +; SSE-NEXT: pshufd $0, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm13, %xmm7 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm9 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 ; SSE-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, 32(%rax) -; SSE-NEXT: movdqa %xmm10, 48(%rax) +; SSE-NEXT: movdqa %xmm10, 32(%rax) +; SSE-NEXT: movdqa %xmm7, 48(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm15, 112(%rax) -; SSE-NEXT: movdqa %xmm6, 160(%rax) -; SSE-NEXT: movdqa %xmm11, 176(%rax) -; SSE-NEXT: movdqa %xmm12, (%rax) -; SSE-NEXT: movdqa %xmm14, 16(%rax) +; SSE-NEXT: movdqa %xmm5, 112(%rax) +; SSE-NEXT: movdqa %xmm15, 160(%rax) +; SSE-NEXT: movdqa %xmm6, 176(%rax) +; SSE-NEXT: movdqa %xmm11, (%rax) +; SSE-NEXT: movdqa %xmm12, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1087,9 +1086,8 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -1097,8 +1095,8 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] @@ -1111,15 +1109,15 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm12[8,u],zero,zero,zero,zero,xmm12[9,u],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm14[8],zero,zero,zero,zero,zero,xmm14[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = 
zero,zero,zero,zero,zero,xmm13[8],zero,zero,zero,zero,zero,xmm13[9],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm12[5,u],zero,zero,zero,zero,xmm12[6,u],zero,zero,zero,zero,xmm12[7,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm14[5],zero,zero,zero,zero,zero,xmm14[6],zero,zero,zero,zero,zero,xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm13[5],zero,zero,zero,zero,zero,xmm13[6],zero,zero,zero,zero,zero,xmm13[7] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] @@ -1136,114 +1134,112 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[10,u],zero,zero,zero,zero,xmm12[11,u],zero,zero,zero,zero,xmm12[12,u],zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm14[10],zero,zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,xmm14[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm13[10],zero,zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero,zero,xmm13[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm13[13],zero,zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,zero,xmm13[15] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm5[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1,2],xmm8[3],xmm15[4,5],xmm8[6],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0],zero,xmm8[2,3,4,5,6],zero,xmm8[8,9,10,11,12],zero,xmm8[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1,2],xmm2[3],xmm15[4,5],xmm2[6],xmm15[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6],zero,xmm2[8,9,10,11,12],zero,xmm2[14,15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[0,0,1,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm10, %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm12[0,u],zero,zero,zero,zero,xmm12[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm15 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm12[0,u],zero,zero,zero,zero,xmm12[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpunpcklbw 
{{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm7 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3,4],xmm10[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,5,6,7,7] 
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7,8],zero,xmm4[10,11,12,13,14],zero @@ -1252,7 +1248,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1260,7 +1256,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm11, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) @@ -1279,127 +1275,124 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-SLOW-NEXT: pushq %rax +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = 
[5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm14 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[16],ymm12[16],ymm8[17],ymm12[17],ymm8[18],ymm12[18],ymm8[19],ymm12[19],ymm8[20],ymm12[20],ymm8[21],ymm12[21],ymm8[22],ymm12[22],ymm8[23],ymm12[23] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = 
ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[4],ymm15[4],ymm8[5],ymm15[5],ymm8[6],ymm15[6],ymm8[7],ymm15[7],ymm8[16],ymm15[16],ymm8[17],ymm15[17],ymm8[18],ymm15[18],ymm8[19],ymm15[19],ymm8[20],ymm15[20],ymm8[21],ymm15[21],ymm8[22],ymm15[22],ymm8[23],ymm15[23] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[4],ymm10[4],ymm7[5],ymm10[5],ymm7[6],ymm10[6],ymm7[7],ymm10[7],ymm7[16],ymm10[16],ymm7[17],ymm10[17],ymm7[18],ymm10[18],ymm7[19],ymm10[19],ymm7[20],ymm10[20],ymm7[21],ymm10[21],ymm7[22],ymm10[22],ymm7[23],ymm10[23] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[16],ymm15[16],ymm7[17],ymm15[17],ymm7[18],ymm15[18],ymm7[19],ymm15[19],ymm7[20],ymm15[20],ymm7[21],ymm15[21],ymm7[22],ymm15[22],ymm7[23],ymm15[23] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm12, %ymm15, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm15, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm15, %ymm12 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, 
%ymm12, %ymm14, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm15, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm9 = 
ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm15, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm15 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm12[8],ymm2[9],ymm12[9],ymm2[10],ymm12[10],ymm2[11],ymm12[11],ymm2[12],ymm12[12],ymm2[13],ymm12[13],ymm2[14],ymm12[14],ymm2[15],ymm12[15],ymm2[24],ymm12[24],ymm2[25],ymm12[25],ymm2[26],ymm12[26],ymm2[27],ymm12[27],ymm2[28],ymm12[28],ymm2[29],ymm12[29],ymm2[30],ymm12[30],ymm2[31],ymm12[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -1411,268 +1404,281 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa 
%ymm15, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%rax) +; AVX2-SLOW-NEXT: popq %rax ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-NEXT: subq $40, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm9 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb 
%ymm13, %ymm10, %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm14 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[16],ymm12[16],ymm9[17],ymm12[17],ymm9[18],ymm12[18],ymm9[19],ymm12[19],ymm9[20],ymm12[20],ymm9[21],ymm12[21],ymm9[22],ymm12[22],ymm9[23],ymm12[23] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[4],ymm15[4],ymm9[5],ymm15[5],ymm9[6],ymm15[6],ymm9[7],ymm15[7],ymm9[16],ymm15[16],ymm9[17],ymm15[17],ymm9[18],ymm15[18],ymm9[19],ymm15[19],ymm9[20],ymm15[20],ymm9[21],ymm15[21],ymm9[22],ymm15[22],ymm9[23],ymm15[23] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb 
{{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm13 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm13 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = 
ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: vpunpcklbw 
{{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = 
ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[16],ymm12[16],ymm9[17],ymm12[17],ymm9[18],ymm12[18],ymm9[19],ymm12[19],ymm9[20],ymm12[20],ymm9[21],ymm12[21],ymm9[22],ymm12[22],ymm9[23],ymm12[23] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[4],ymm15[4],ymm9[5],ymm15[5],ymm9[6],ymm15[6],ymm9[7],ymm15[7],ymm9[16],ymm15[16],ymm9[17],ymm15[17],ymm9[18],ymm15[18],ymm9[19],ymm15[19],ymm9[20],ymm15[20],ymm9[21],ymm15[21],ymm9[22],ymm15[22],ymm9[23],ymm15[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = 
xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = 
ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = 
[255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2125,13 +2131,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm12 ; SSE-NEXT: pandn %xmm7, %xmm12 ; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] @@ -2144,11 +2149,12 @@ define void 
@store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm12, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm13 @@ -2156,19 +2162,19 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: por %xmm7, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[1,1,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 @@ -2177,24 +2183,26 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pand %xmm14, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: pand %xmm14, %xmm10 ; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: pand %xmm11, %xmm5 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: pandn %xmm7, %xmm8 ; SSE-NEXT: por %xmm5, %xmm8 ; SSE-NEXT: pand %xmm3, %xmm8 @@ -2203,12 +2211,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 ; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2234,103 +2240,105 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm11 +; SSE-NEXT: movdqa 16(%rdx), %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movdqa 16(%rcx), %xmm5 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa 16(%r8), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; 
SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa 16(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 ; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: por %xmm6, %xmm13 ; SSE-NEXT: movdqa 16(%r9), %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,0,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,0,0,0] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm7, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm13, %xmm12 ; SSE-NEXT: pand %xmm0, %xmm15 ; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = 
xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,2,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm5, %xmm8 ; SSE-NEXT: pand %xmm3, %xmm7 @@ -2344,42 +2352,41 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, 
%xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm7 ; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,5,6,7,7] +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdx), %xmm1 @@ -2388,17 +2395,17 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa 32(%r8), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa 32(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm12 @@ -2406,63 +2413,64 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm6, %xmm12 ; SSE-NEXT: movdqa 32(%r9), %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; 
SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[3,3,3,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,2,2] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm10 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; 
SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm11 +; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pandn %xmm5, %xmm8 ; SSE-NEXT: pand %xmm3, %xmm7 @@ -2480,118 +2488,117 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm7 ; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pand %xmm13, %xmm8 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa 
%xmm11, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movdqa 48(%rdx), %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm9 ; SSE-NEXT: movdqa 48(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa 48(%rsi), %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa 48(%r8), %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: movdqa 48(%r9), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa 48(%r9), %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,0,0] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm15 -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm12, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm13 ; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm12 ; SSE-NEXT: por %xmm12, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; 
SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm13 ; SSE-NEXT: por %xmm13, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,0,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1] -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] @@ -2601,52 +2608,52 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: 
pand %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm2, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: por %xmm2, %xmm14 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 368(%rax) -; SSE-NEXT: movdqa %xmm9, 352(%rax) +; SSE-NEXT: movdqa %xmm14, 368(%rax) +; SSE-NEXT: movdqa %xmm10, 352(%rax) ; SSE-NEXT: movdqa %xmm4, 336(%rax) ; SSE-NEXT: movdqa %xmm8, 320(%rax) ; SSE-NEXT: movdqa %xmm13, 304(%rax) ; SSE-NEXT: movdqa %xmm15, 288(%rax) -; SSE-NEXT: movdqa %xmm11, 272(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2687,190 +2694,191 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-LABEL: store_i8_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $200, %rsp -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10],zero,xmm4[12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,1,2,3,4],zero,xmm5[6,7,8,9,10],zero,xmm5[12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, 
%ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6],zero,xmm0[8,9,10,11,12],zero,xmm0[14,15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm11[13,u],zero,zero,zero,zero,xmm11[14,u],zero,zero,zero,zero,xmm11[15,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3],xmm13[4],xmm0[5,6],xmm13[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[10,u],zero,zero,zero,zero,xmm11[11,u],zero,zero,zero,zero,xmm11[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1,2],xmm13[3],xmm8[4,5],xmm13[6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,1,2],zero,xmm8[4,5,6,7,8],zero,xmm8[10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm8[13],zero,zero,zero,zero,zero,xmm8[14],zero,zero,zero,zero,zero,xmm8[15] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, 
%xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1,2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4],xmm2[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps 
%ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[2,u],zero,zero,zero,zero,xmm11[3,u],zero,zero,zero,zero,xmm11[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5,6],zero,xmm5[8,9,10,11,12],zero,xmm5[14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = 
xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: 
vmovdqa 16(%rdi), %xmm1 @@ -2882,174 +2890,176 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3,4],xmm5[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[10,u],zero,zero,zero,zero,xmm3[11,u],zero,zero,zero,zero,xmm3[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm2[10],zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,xmm2[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 -; 
AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2],xmm13[3],xmm14[4,5],xmm13[6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5],xmm15[6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,0,1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm15, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm10, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1,2],xmm13[3],xmm10[4,5],xmm13[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; 
AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm6 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1,2],xmm3[3],xmm7[4,5],xmm3[6],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2],xmm3[3],xmm8[4,5],xmm3[6],xmm8[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5,6],zero,xmm3[8,9,10,11,12],zero,xmm3[14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; 
AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2],zero,xmm2[4,5,6,7,8],zero,xmm2[10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8],zero,xmm1[10,11,12,13,14],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3094,52 +3104,56 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: subq $664, %rsp # imm = 0x298 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-SLOW-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 @@ -3148,257 +3162,256 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm6 +; 
AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; 
AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = 
<255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = 
ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[4],ymm11[4],ymm8[5],ymm11[5],ymm8[6],ymm11[6],ymm8[7],ymm11[7],ymm8[16],ymm11[16],ymm8[17],ymm11[17],ymm8[18],ymm11[18],ymm8[19],ymm11[19],ymm8[20],ymm11[20],ymm8[21],ymm11[21],ymm8[22],ymm11[22],ymm8[23],ymm11[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = 
ymm15[0],mem[0],ymm15[1],mem[1],ymm15[2],mem[2],ymm15[3],mem[3],ymm15[4],mem[4],ymm15[5],mem[5],ymm15[6],mem[6],ymm15[7],mem[7],ymm15[16],mem[16],ymm15[17],mem[17],ymm15[18],mem[18],ymm15[19],mem[19],ymm15[20],mem[20],ymm15[21],mem[21],ymm15[22],mem[22],ymm15[23],mem[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[16],ymm13[16],ymm12[17],ymm13[17],ymm12[18],ymm13[18],ymm12[19],ymm13[19],ymm12[20],ymm13[20],ymm12[21],ymm13[21],ymm12[22],ymm13[22],ymm12[23],ymm13[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[16],mem[16],ymm3[17],mem[17],ymm3[18],mem[18],ymm3[19],mem[19],ymm3[20],mem[20],ymm3[21],mem[21],ymm3[22],mem[22],ymm3[23],mem[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm11, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; 
AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = 
ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm14, %ymm10 -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11],ymm12[12],mem[12],ymm12[13],mem[13],ymm12[14],mem[14],ymm12[15],mem[15],ymm12[24],mem[24],ymm12[25],mem[25],ymm12[26],mem[26],ymm12[27],mem[27],ymm12[28],mem[28],ymm12[29],mem[29],ymm12[30],mem[30],ymm12[31],mem[31] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm14 = ymm8[8],ymm11[8],ymm8[9],ymm11[9],ymm8[10],ymm11[10],ymm8[11],ymm11[11],ymm8[12],ymm11[12],ymm8[13],ymm11[13],ymm8[14],ymm11[14],ymm8[15],ymm11[15],ymm8[24],ymm11[24],ymm8[25],ymm11[25],ymm8[26],ymm11[26],ymm8[27],ymm11[27],ymm8[28],ymm11[28],ymm8[29],ymm11[29],ymm8[30],ymm11[30],ymm8[31],ymm11[31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm15, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15],ymm12[24],ymm13[24],ymm12[25],ymm13[25],ymm12[26],ymm13[26],ymm12[27],ymm13[27],ymm12[28],ymm13[28],ymm12[29],ymm13[29],ymm12[30],ymm13[30],ymm12[31],ymm13[31] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[8],mem[8],ymm5[9],mem[9],ymm5[10],mem[10],ymm5[11],mem[11],ymm5[12],mem[12],ymm5[13],mem[13],ymm5[14],mem[14],ymm5[15],mem[15],ymm5[24],mem[24],ymm5[25],mem[25],ymm5[26],mem[26],ymm5[27],mem[27],ymm5[28],mem[28],ymm5[29],mem[29],ymm5[30],mem[30],ymm5[31],mem[31] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3408,27 +3421,27 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: addq $664, %rsp # imm = 0x298 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 ; 
AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 @@ -3437,204 +3450,206 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm12 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm13, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, 
%ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm12[8],ymm0[9],ymm12[9],ymm0[10],ymm12[10],ymm0[11],ymm12[11],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15],ymm0[24],ymm12[24],ymm0[25],ymm12[25],ymm0[26],ymm12[26],ymm0[27],ymm12[27],ymm0[28],ymm12[28],ymm0[29],ymm12[29],ymm0[30],ymm12[30],ymm0[31],ymm12[31] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm7[8],ymm14[8],ymm7[9],ymm14[9],ymm7[10],ymm14[10],ymm7[11],ymm14[11],ymm7[12],ymm14[12],ymm7[13],ymm14[13],ymm7[14],ymm14[14],ymm7[15],ymm14[15],ymm7[24],ymm14[24],ymm7[25],ymm14[25],ymm7[26],ymm14[26],ymm7[27],ymm14[27],ymm7[28],ymm14[28],ymm7[29],ymm14[29],ymm7[30],ymm14[30],ymm7[31],ymm14[31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm12 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 
-; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm15, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload @@ -3642,82 +3657,82 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = 
<255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw (%rsp), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = 
ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[16],ymm13[16],ymm15[17],ymm13[17],ymm15[18],ymm13[18],ymm15[19],ymm13[19],ymm15[20],ymm13[20],ymm15[21],ymm13[21],ymm15[22],ymm13[22],ymm15[23],ymm13[23] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[4],ymm12[4],ymm7[5],ymm12[5],ymm7[6],ymm12[6],ymm7[7],ymm12[7],ymm7[16],ymm12[16],ymm7[17],ymm12[17],ymm7[18],ymm12[18],ymm7[19],ymm12[19],ymm7[20],ymm12[20],ymm7[21],ymm12[21],ymm7[22],ymm12[22],ymm7[23],ymm12[23] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = 
[255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb 
%ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3736,23 +3751,23 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: subq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 @@ -3761,204 +3776,206 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = 
<255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: 
vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm13, %ymm5, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
-; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm7
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm4
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0]
+; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm5
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9>
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm4
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9>
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX2-FAST-PERLANE-NEXT: vpermq
{{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u>
+; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6
+; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
-; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31]
-; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm12[8],ymm0[9],ymm12[9],ymm0[10],ymm12[10],ymm0[11],ymm12[11],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15],ymm0[24],ymm12[24],ymm0[25],ymm12[25],ymm0[26],ymm12[26],ymm0[27],ymm12[27],ymm0[28],ymm12[28],ymm0[29],ymm12[29],ymm0[30],ymm12[30],ymm0[31],ymm12[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm7[8],ymm14[8],ymm7[9],ymm14[9],ymm7[10],ymm14[10],ymm7[11],ymm14[11],ymm7[12],ymm14[12],ymm7[13],ymm14[13],ymm7[14],ymm14[14],ymm7[15],ymm14[15],ymm7[24],ymm14[24],ymm7[25],ymm14[25],ymm7[26],ymm14[26],ymm7[27],ymm14[27],ymm7[28],ymm14[28],ymm7[29],ymm14[29],ymm7[30],ymm14[30],ymm7[31],ymm14[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = 
[0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm8, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm15, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm14, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload @@ -3966,82 +3983,82 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw (%rsp), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] 
-; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[16],ymm13[16],ymm15[17],ymm13[17],ymm15[18],ymm13[18],ymm15[19],ymm13[19],ymm15[20],ymm13[20],ymm15[21],ymm13[21],ymm15[22],ymm13[22],ymm15[23],ymm13[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[4],ymm12[4],ymm7[5],ymm12[5],ymm7[6],ymm12[6],ymm7[7],ymm12[7],ymm7[16],ymm12[16],ymm7[17],ymm12[17],ymm7[18],ymm12[18],ymm7[19],ymm12[19],ymm7[20],ymm12[20],ymm7[21],ymm12[21],ymm7[22],ymm12[22],ymm7[23],ymm12[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4060,10 +4077,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4071,572 +4088,571 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: subq $264, %rsp # imm = 0x108 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 -; 
AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm1 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm28 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 +; 
AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15],ymm4[24],ymm13[24],ymm4[25],ymm13[25],ymm4[26],ymm13[26],ymm4[27],ymm13[27],ymm4[28],ymm13[28],ymm4[29],ymm13[29],ymm4[30],ymm13[30],ymm4[31],ymm13[31] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] 
+; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31] ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm0 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm15 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm25 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[16],ymm0[16],ymm11[17],ymm0[17],ymm11[18],ymm0[18],ymm11[19],ymm0[19],ymm11[20],ymm0[20],ymm11[21],ymm0[21],ymm11[22],ymm0[22],ymm11[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15],ymm1[24],ymm15[24],ymm1[25],ymm15[25],ymm1[26],ymm15[26],ymm1[27],ymm15[27],ymm1[28],ymm15[28],ymm1[29],ymm15[29],ymm1[30],ymm15[30],ymm1[31],ymm15[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[16],ymm0[16],ymm7[17],ymm0[17],ymm7[18],ymm0[18],ymm7[19],ymm0[19],ymm7[20],ymm0[20],ymm7[21],ymm0[21],ymm7[22],ymm0[22],ymm7[23],ymm0[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15],ymm2[24],ymm15[24],ymm2[25],ymm15[25],ymm2[26],ymm15[26],ymm2[27],ymm15[27],ymm2[28],ymm15[28],ymm2[29],ymm15[29],ymm2[30],ymm15[30],ymm2[31],ymm15[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm24 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11],ymm11[12],ymm0[12],ymm11[13],ymm0[13],ymm11[14],ymm0[14],ymm11[15],ymm0[15],ymm11[24],ymm0[24],ymm11[25],ymm0[25],ymm11[26],ymm0[26],ymm11[27],ymm0[27],ymm11[28],ymm0[28],ymm11[29],ymm0[29],ymm11[30],ymm0[30],ymm11[31],ymm0[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm22 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[16],ymm0[16],ymm11[17],ymm0[17],ymm11[18],ymm0[18],ymm11[19],ymm0[19],ymm11[20],ymm0[20],ymm11[21],ymm0[21],ymm11[22],ymm0[22],ymm11[23],ymm0[23] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[16],ymm7[16],ymm12[17],ymm7[17],ymm12[18],ymm7[18],ymm12[19],ymm7[19],ymm12[20],ymm7[20],ymm12[21],ymm7[21],ymm12[22],ymm7[22],ymm12[23],ymm7[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm13[8],ymm0[8],ymm13[9],ymm0[9],ymm13[10],ymm0[10],ymm13[11],ymm0[11],ymm13[12],ymm0[12],ymm13[13],ymm0[13],ymm13[14],ymm0[14],ymm13[15],ymm0[15],ymm13[24],ymm0[24],ymm13[25],ymm0[25],ymm13[26],ymm0[26],ymm13[27],ymm0[27],ymm13[28],ymm0[28],ymm13[29],ymm0[29],ymm13[30],ymm0[30],ymm13[31],ymm0[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm22 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[16],ymm0[16],ymm13[17],ymm0[17],ymm13[18],ymm0[18],ymm13[19],ymm0[19],ymm13[20],ymm0[20],ymm13[21],ymm0[21],ymm13[22],ymm0[22],ymm13[23],ymm0[23] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[16],ymm15[16],ymm1[17],ymm15[17],ymm1[18],ymm15[18],ymm1[19],ymm15[19],ymm1[20],ymm15[20],ymm1[21],ymm15[21],ymm1[22],ymm15[22],ymm1[23],ymm15[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[16],ymm15[16],ymm2[17],ymm15[17],ymm2[18],ymm15[18],ymm2[19],ymm15[19],ymm2[20],ymm15[20],ymm2[21],ymm15[21],ymm2[22],ymm15[22],ymm2[23],ymm15[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm1 
-; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-SLOW-NEXT: # 
ymm5 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-SLOW-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm26 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512F-SLOW-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512F-SLOW-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, 
%ymm3, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm15[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm19, %ymm15 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm6 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm10 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm14[0,0,0,1] +; AVX512F-SLOW-NEXT: vprold $16, %ymm19, %ymm1 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm10, %zmm8 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm8, %ymm3, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm9[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm30[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm31, %ymm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm21[0,0,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm12 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm20[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm17[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm11[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm16[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm12, %zmm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm29[0,0,0,1] +; AVX512F-SLOW-NEXT: vprold $16, %ymm30, %ymm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,0,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} 
ymm11 = ymm11[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm28[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm21[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm18[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm13[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm16[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm9 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm15, %ymm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm9, %ymm3, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm13[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm8, %ymm6 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm9, %ymm4, %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm26[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm5 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm27[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm12 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm12 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm1 +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm11 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm30 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm8, %ymm15, %ymm31 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm29 +; AVX512F-SLOW-NEXT: vpternlogq 
$184, %ymm5, %ymm8, %ymm30 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm8, %ymm31 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm31[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq $64, (%rsp), %zmm8 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm13, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm12, %ymm15, %ymm11 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm12, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm11[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $64, (%rsp), %zmm11 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm13, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm8 ; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm1 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm12 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm12 +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm11 = mem[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm11 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm24[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm13 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm12, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm3, %ymm6 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm13 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm10 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm3, %ymm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm12, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm13, %ymm9, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = 
zmm25[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm9, %ymm7 +; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm13, %ymm9, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm25[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm22[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm2 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-SLOW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride6_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $200, %rsp -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-FAST-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-FAST-NEXT: 
vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm26 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm27 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm2 -; AVX512F-FAST-NEXT: 
vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15],ymm5[24],ymm3[24],ymm5[25],ymm3[25],ymm5[26],ymm3[26],ymm5[27],ymm3[27],ymm5[28],ymm3[28],ymm5[29],ymm3[29],ymm5[30],ymm3[30],ymm5[31],ymm3[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm30 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[16],ymm2[16],ymm5[17],ymm2[17],ymm5[18],ymm2[18],ymm5[19],ymm2[19],ymm5[20],ymm2[20],ymm5[21],ymm2[21],ymm5[22],ymm2[22],ymm5[23],ymm2[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15],ymm15[24],ymm3[24],ymm15[25],ymm3[25],ymm15[26],ymm3[26],ymm15[27],ymm3[27],ymm15[28],ymm3[28],ymm15[29],ymm3[29],ymm15[30],ymm3[30],ymm15[31],ymm3[31] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm3 +; AVX512F-FAST-NEXT: 
vpshufb %ymm4, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[16],ymm2[16],ymm9[17],ymm2[17],ymm9[18],ymm2[18],ymm9[19],ymm2[19],ymm9[20],ymm2[20],ymm9[21],ymm2[21],ymm9[22],ymm2[22],ymm9[23],ymm2[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm10[8],ymm4[8],ymm10[9],ymm4[9],ymm10[10],ymm4[10],ymm10[11],ymm4[11],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15],ymm10[24],ymm4[24],ymm10[25],ymm4[25],ymm10[26],ymm4[26],ymm10[27],ymm4[27],ymm10[28],ymm4[28],ymm10[29],ymm4[29],ymm10[30],ymm4[30],ymm10[31],ymm4[31] -; AVX512F-FAST-NEXT: vmovdqa %ymm10, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15],ymm13[24],ymm11[24],ymm13[25],ymm11[25],ymm13[26],ymm11[26],ymm13[27],ymm11[27],ymm13[28],ymm11[28],ymm13[29],ymm11[29],ymm13[30],ymm11[30],ymm13[31],ymm11[31] +; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, 
%ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm12 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm12, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm15[0],ymm6[0],ymm15[1],ymm6[1],ymm15[2],ymm6[2],ymm15[3],ymm6[3],ymm15[4],ymm6[4],ymm15[5],ymm6[5],ymm15[6],ymm6[6],ymm15[7],ymm6[7],ymm15[16],ymm6[16],ymm15[17],ymm6[17],ymm15[18],ymm6[18],ymm15[19],ymm6[19],ymm15[20],ymm6[20],ymm15[21],ymm6[21],ymm15[22],ymm6[22],ymm15[23],ymm6[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, 
%xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm26 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm27 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm11 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm1, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm2, %zmm22 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[4],ymm3[4],ymm13[5],ymm3[5],ymm13[6],ymm3[6],ymm13[7],ymm3[7],ymm13[16],ymm3[16],ymm13[17],ymm3[17],ymm13[18],ymm3[18],ymm13[19],ymm3[19],ymm13[20],ymm3[20],ymm13[21],ymm3[21],ymm13[22],ymm3[22],ymm13[23],ymm3[23] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = 
ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[16],ymm9[16],ymm5[17],ymm9[17],ymm5[18],ymm9[18],ymm5[19],ymm9[19],ymm5[20],ymm9[20],ymm5[21],ymm9[21],ymm5[22],ymm9[22],ymm5[23],ymm9[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm9 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[4],ymm9[4],ymm3[5],ymm9[5],ymm3[6],ymm9[6],ymm3[7],ymm9[7],ymm3[16],ymm9[16],ymm3[17],ymm9[17],ymm3[18],ymm9[18],ymm3[19],ymm9[19],ymm3[20],ymm9[20],ymm3[21],ymm9[21],ymm3[22],ymm9[22],ymm3[23],ymm9[23] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[16],ymm3[16],ymm8[17],ymm3[17],ymm8[18],ymm3[18],ymm8[19],ymm3[19],ymm8[20],ymm3[20],ymm8[21],ymm3[21],ymm8[22],ymm3[22],ymm8[23],ymm3[23] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm15 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm14 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm10 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm26 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm27 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,0,0,1] -; AVX512F-FAST-NEXT: vprold $16, %ymm20, %ymm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm13[0,0,0,1] +; AVX512F-FAST-NEXT: vprold $16, %ymm19, %ymm2 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; 
AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,2,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm8, %ymm2, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm9, %zmm2 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm0, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm8, %zmm7 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm31[0,0,0,1] -; AVX512F-FAST-NEXT: vprold $16, %ymm28, %ymm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm29[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm18[2,2,2,3] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm10, %zmm8, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm30[0,0,0,1] +; AVX512F-FAST-NEXT: vprold $16, %ymm28, %ymm11 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm31[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm20[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm21[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3] 
+; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm18[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm17[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm19[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm30[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm17[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm16[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm10 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm10, %ymm5 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm13, %ymm11 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm10, %ymm2, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm5 -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm10, %zmm9, %zmm11 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm10, %ymm1, %ymm13 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm8, %zmm10 +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm5 = mem[2,2,2,3,6,6,6,7] ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm8 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm10 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm9, %zmm10 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm30 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm10, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm8, %ymm2, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm15[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq $234, 
(%rsp), %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vpermq $234, (%rsp), %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm5 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm11 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm11 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm5 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm29 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm5 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm31 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm8, %ymm0, %ymm30 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm30[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm8 = mem[2,2,2,3,6,6,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm10, %ymm2, %ymm31 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm31[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm24[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm26[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm25[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm27[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm21[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm11 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm14 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm0, %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm11 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm9, %zmm11 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm26[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm24[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm12, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm27[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm23[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm12, %zmm13 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm5 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm13, %ymm5 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm4 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %ymm13, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm25[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm23[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm22[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm22[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm3 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-FAST-NEXT: addq $200, %rsp +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -4853,43 +4869,43 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512BW-FAST-LABEL: store_i8_stride6_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm8 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 
-; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm8, %ymm3 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm1, %ymm9, %ymm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm2 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm14, %ymm4 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15],ymm7[24],ymm5[24],ymm7[25],ymm5[25],ymm7[26],ymm5[26],ymm7[27],ymm5[27],ymm7[28],ymm5[28],ymm7[29],ymm5[29],ymm7[30],ymm5[30],ymm7[31],ymm5[31] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm13, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: movl $613566756, %eax # imm = 0x24924924 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm3[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm4, %zmm4 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] ; AVX512BW-FAST-NEXT: movl $-1840700270, %eax # imm = 0x92492492 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm4[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm1[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm11, %zmm11 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7] @@ -4897,160 +4913,160 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm0 {%k3} ; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm17[0],ymm7[1],ymm17[1],ymm7[2],ymm17[2],ymm7[3],ymm17[3],ymm7[4],ymm17[4],ymm7[5],ymm17[5],ymm7[6],ymm17[6],ymm7[7],ymm17[7],ymm7[16],ymm17[16],ymm7[17],ymm17[17],ymm7[18],ymm17[18],ymm7[19],ymm17[19],ymm7[20],ymm17[20],ymm7[21],ymm17[21],ymm7[22],ymm17[22],ymm7[23],ymm17[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm17 = 
ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15],ymm13[24],ymm11[24],ymm13[25],ymm11[25],ymm13[26],ymm11[26],ymm13[27],ymm11[27],ymm13[28],ymm11[28],ymm13[29],ymm11[29],ymm13[30],ymm11[30],ymm13[31],ymm11[31] -; AVX512BW-FAST-NEXT: vpermw %ymm17, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm17, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm18, %ymm12 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[16],ymm7[16],ymm12[17],ymm7[17],ymm12[18],ymm7[18],ymm12[19],ymm7[19],ymm12[20],ymm7[20],ymm12[21],ymm7[21],ymm12[22],ymm7[22],ymm12[23],ymm7[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm18[8],ymm17[8],ymm18[9],ymm17[9],ymm18[10],ymm17[10],ymm18[11],ymm17[11],ymm18[12],ymm17[12],ymm18[13],ymm17[13],ymm18[14],ymm17[14],ymm18[15],ymm17[15],ymm18[24],ymm17[24],ymm18[25],ymm17[25],ymm18[26],ymm17[26],ymm18[27],ymm17[27],ymm18[28],ymm17[28],ymm18[29],ymm17[29],ymm18[30],ymm17[30],ymm18[31],ymm17[31] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm14, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm7 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm9 -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm9, %zmm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm7 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm10 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm17 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm6 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm17[0],ymm6[1],ymm17[1],ymm6[2],ymm17[2],ymm6[3],ymm17[3],ymm6[4],ymm17[4],ymm6[5],ymm17[5],ymm6[6],ymm17[6],ymm6[7],ymm17[7],ymm6[16],ymm17[16],ymm6[17],ymm17[17],ymm6[18],ymm17[18],ymm6[19],ymm17[19],ymm6[20],ymm17[20],ymm6[21],ymm17[21],ymm6[22],ymm17[22],ymm6[23],ymm17[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm17 = ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15],ymm14[24],ymm11[24],ymm14[25],ymm11[25],ymm14[26],ymm11[26],ymm14[27],ymm11[27],ymm14[28],ymm11[28],ymm14[29],ymm11[29],ymm14[30],ymm11[30],ymm14[31],ymm11[31] +; AVX512BW-FAST-NEXT: vpermw %ymm17, %ymm9, %ymm9 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm18, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm19 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm12 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[16],ymm6[16],ymm12[17],ymm6[17],ymm12[18],ymm6[18],ymm12[19],ymm6[19],ymm12[20],ymm6[20],ymm12[21],ymm6[21],ymm12[22],ymm6[22],ymm12[23],ymm6[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 
= ymm6[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] +; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm13, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm6 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm10 ; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm10, %zmm10 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm7 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm21 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm6 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm22 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm23 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm22 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm14, %xmm15 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm21 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm12, %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm23 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm15, %xmm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] -; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm24, %ymm15 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm24, %ymm13 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm17 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm25 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm15, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm19 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm19, %xmm20 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = 
xmm20[8],xmm10[8],xmm20[9],xmm10[9],xmm20[10],xmm10[10],xmm20[11],xmm10[11],xmm20[12],xmm10[12],xmm20[13],xmm10[13],xmm20[14],xmm10[14],xmm20[15],xmm10[15] +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm17, %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm20 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm20, %xmm16 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm16[8],xmm10[8],xmm16[9],xmm10[9],xmm16[10],xmm10[10],xmm16[11],xmm10[11],xmm16[12],xmm10[12],xmm16[13],xmm10[13],xmm16[14],xmm10[14],xmm16[15],xmm10[15] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3],xmm20[4],xmm17[4],xmm20[5],xmm17[5],xmm20[6],xmm17[6],xmm20[7],xmm17[7] ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-FAST-NEXT: vpermw %ymm20, %ymm26, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm20, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm16 +; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm26, %ymm16 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm13 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = <8,u,9,u,u,u,u,u,u,u,5,u,6,u,7,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm16, %xmm20 -; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm16[0],zero,xmm16[1],zero,xmm16[2],zero,xmm16[3],zero,xmm16[4],zero,xmm16[5],zero,xmm16[6],zero,xmm16[7],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm28, %zmm20 +; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm13, %xmm16 +; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm28, %zmm16 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512BW-FAST-NEXT: vpermw %zmm20, %zmm28, %zmm10 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm20 +; AVX512BW-FAST-NEXT: vpermw %zmm16, %zmm28, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm16 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm20, %xmm30 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm20[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm16, %xmm30 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm30, %zmm31, %zmm30 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm31 ; AVX512BW-FAST-NEXT: vpermw %zmm30, %zmm28, %zmm30 ; AVX512BW-FAST-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm30, %zmm10 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm21, %xmm30 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm23 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm23[0],xmm30[0],xmm23[1],xmm30[1],xmm23[2],xmm30[2],xmm23[3],xmm30[3],xmm23[4],xmm30[4],xmm23[5],xmm30[5],xmm23[6],xmm30[6],xmm23[7],xmm30[7] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm30 = 
xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm22, %xmm30 +; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm23, %xmm21 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm21[0],xmm30[0],xmm21[1],xmm30[1],xmm21[2],xmm30[2],xmm21[3],xmm30[3],xmm21[4],xmm30[4],xmm21[5],xmm30[5],xmm21[6],xmm30[6],xmm21[7],xmm30[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] ; AVX512BW-FAST-NEXT: vpermw %ymm30, %ymm24, %ymm24 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm30 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm31, %xmm23 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm24, %zmm24 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm31, %xmm21 ; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm30, %xmm25 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm25[8],xmm21[8],xmm25[9],xmm21[9],xmm25[10],xmm21[10],xmm25[11],xmm21[11],xmm25[12],xmm21[12],xmm25[13],xmm21[13],xmm25[14],xmm21[14],xmm25[15],xmm21[15] ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm31[0],xmm30[1],xmm31[1],xmm30[2],xmm31[2],xmm30[3],xmm31[3],xmm30[4],xmm31[4],xmm30[5],xmm31[5],xmm30[6],xmm31[6],xmm30[7],xmm31[7] ; AVX512BW-FAST-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm23 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm25, %zmm21 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %xmm25 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm23 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm21 {%k2} ; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm25, %xmm24 ; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm26 -; AVX512BW-FAST-NEXT: vpermw %zmm24, %zmm28, %zmm23 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm26, %xmm24 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm26[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm27, %zmm24 -; AVX512BW-FAST-NEXT: vpermw %zmm24, %zmm28, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm24, %zmm23 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm17 = ymm18[0],ymm17[0],ymm18[1],ymm17[1],ymm18[2],ymm17[2],ymm18[3],ymm17[3],ymm18[4],ymm17[4],ymm18[5],ymm17[5],ymm18[6],ymm17[6],ymm18[7],ymm17[7],ymm18[16],ymm17[16],ymm18[17],ymm17[17],ymm18[18],ymm17[18],ymm18[19],ymm17[19],ymm18[20],ymm17[20],ymm18[21],ymm17[21],ymm18[22],ymm17[22],ymm18[23],ymm17[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; 
AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[16],ymm11[16],ymm13[17],ymm11[17],ymm13[18],ymm11[18],ymm13[19],ymm11[19],ymm13[20],ymm11[20],ymm13[21],ymm11[21],ymm13[22],ymm11[22],ymm13[23],ymm11[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm13, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] -; AVX512BW-FAST-NEXT: vpermw %zmm17, %zmm18, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm13, %zmm1 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm18, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm24 +; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm28, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm24, %xmm26 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm24[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm27, %zmm26 +; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm28, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm21 {%k3} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm18 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = 
ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[16],ymm11[16],ymm14[17],ymm11[17],ymm14[18],ymm11[18],ymm14[19],ymm11[19],ymm14[20],ymm11[20],ymm14[21],ymm11[21],ymm14[22],ymm11[22],ymm14[23],ymm11[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm14, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512BW-FAST-NEXT: vpermw %zmm18, %zmm19, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vpermw %zmm3, %zmm14, %zmm3 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm19, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512BW-FAST-NEXT: movl $1227133513, %eax # imm = 0x49249249 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm26[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = 
[0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 ; AVX512BW-FAST-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm11 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm2 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm5, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm20[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm3 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k2} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 61997b1350497..0909c3f43ea37 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -288,100 +288,100 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-LABEL: store_i8_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; 
SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm10 = mem[0],zero ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm14 = mem[0],zero +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm10[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,0] -; SSE-NEXT: movdqa %xmm8, %xmm13 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm10[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,0] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm5, %xmm12 ; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm6[8],xmm15[9],xmm6[9],xmm15[10],xmm6[10],xmm15[11],xmm6[11],xmm15[12],xmm6[12],xmm15[13],xmm6[13],xmm15[14],xmm6[14],xmm15[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = 
xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] ; SSE-NEXT: movdqa %xmm12, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: packuswb %xmm6, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,2,3] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,3] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm14, %xmm6 ; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm14, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 +; 
SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm9, %xmm14 ; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; SSE-NEXT: packuswb %xmm2, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; SSE-NEXT: packuswb %xmm2, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: por %xmm9, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] ; SSE-NEXT: pand %xmm2, %xmm12 ; SSE-NEXT: pandn %xmm14, %xmm2 @@ -390,17 +390,17 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: packuswb %xmm13, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm15[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,0,0] -; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE-NEXT: pandn %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,2,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] @@ -409,41 +409,40 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4] ; SSE-NEXT: pandn %xmm7, %xmm8 ; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq %xmm0, 48(%rax) -; SSE-NEXT: movdqa %xmm9, 16(%rax) +; SSE-NEXT: movq %xmm2, 48(%rax) +; SSE-NEXT: movdqa %xmm1, 16(%rax) ; SSE-NEXT: movdqa %xmm12, 32(%rax) ; SSE-NEXT: movdqa %xmm6, (%rax) ; SSE-NEXT: retq @@ -812,23 +811,24 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $72, %rsp +; SSE-NEXT: subq $56, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), 
%rax ; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm9 +; SSE-NEXT: movdqa (%rcx), %xmm5 +; SSE-NEXT: movdqa (%r8), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm8 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -839,139 +839,141 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: pandn %xmm4, %xmm10 ; SSE-NEXT: por %xmm3, %xmm10 ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; 
SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,7,7,7] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa (%rax), %xmm7 +; SSE-NEXT: por %xmm10, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,3] +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,0] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm5, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm4, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: por %xmm14, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 ; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,6,6,6] +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6] ; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] ; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -979,168 +981,166 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm15[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: por %xmm0, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm12 -; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm7, %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm9, %xmm14 ; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; 
SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshuflw $233, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: por %xmm6, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm13, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pshuflw 
{{.*#+}} xmm9 = xmm15[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] +; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,2] ; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: por %xmm7, %xmm9 ; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] +; SSE-NEXT: pandn %xmm7, %xmm10 ; SSE-NEXT: por %xmm9, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm7, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2] -; SSE-NEXT: pshuflw $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,4] -; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4] +; SSE-NEXT: pandn %xmm7, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm8[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, 16(%rax) @@ -1153,7 +1153,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: addq $56, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf16: @@ -1816,694 +1816,693 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $344, %rsp # imm = 0x158 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa 16(%r8), %xmm7 -; SSE-NEXT: movdqa 16(%r9), %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE-NEXT: subq $360, %rsp # imm = 0x168 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm6 +; SSE-NEXT: movdqa 16(%r9), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 
= xmm3[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 16(%rax), %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa 16(%rax), %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm14[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = 
xmm7[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm7 ; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,1,2,3] +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: movdqa 
{{.*#+}} xmm3 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,0,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa (%rcx), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: movdqa (%rax), %xmm8 +; SSE-NEXT: movdqa (%rdx), %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa (%r9), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa (%r8), %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa (%rax), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = 
[255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,1,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} 
xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,5,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14 ; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; 
SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,2] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,1] +; SSE-NEXT: movdqa %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = 
[255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,1,1,0] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm10, %xmm3 +; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm4, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte 
Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm14 +; SSE-NEXT: por %xmm6, %xmm14 ; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn 
%xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshuflw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshufhw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,1,3,3] -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: 
pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pshufhw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm11, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 32(%rax) +; SSE-NEXT: movdqa %xmm4, 32(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm8, 112(%rax) +; SSE-NEXT: movdqa %xmm7, 112(%rax) ; SSE-NEXT: movdqa %xmm14, 176(%rax) -; SSE-NEXT: movdqa %xmm15, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2522,268 +2521,272 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: addq $344, %rsp # imm = 0x158 +; SSE-NEXT: addq $360, %rsp # imm = 0x168 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $168, %rsp +; AVX1-ONLY-NEXT: subq $216, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u],zero,zero,xmm12[9,u,u,u,u],zero,zero,xmm12[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u],zero,zero,xmm14[9,u,u,u,u],zero,zero,xmm14[10,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,7],zero,xmm11[u,u,u,u,u,8],zero,xmm11[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,7],zero,xmm5[u,u,u,u,u,8],zero,xmm5[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm2, %ymm15 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm10[4,u,u,u,u],zero,zero,xmm10[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,7],zero,xmm7[u,u,u,u,u,8],zero,xmm7[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u],zero,xmm4[7,u,u,u,u,u],zero,xmm4[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; 
AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm11, %ymm15 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm12[4,u,u,u,u],zero,zero,xmm12[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[8,9],zero,xmm6[u,u,u,u,10,11],zero,xmm6[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,4,5],zero,xmm6[u,u,u,u,6,7],zero,xmm6[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm4, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm4, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0],zero,xmm11[2,3,4,5,6,7],zero,xmm11[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0],zero,xmm6[2,3,4,5,6,7],zero,xmm6[9,10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor 
%xmm0, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,u],zero,zero,xmm10[9,u,u,u,u],zero,zero,xmm10[10,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u],zero,zero,xmm10[11,u,u,u,u],zero,zero,xmm10[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, 
%xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[13,u,u,u,u],zero,zero,xmm10[14,u,u,u,u],zero,zero,xmm10[15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u],zero,zero,xmm10[2,u,u,u,u],zero,zero,xmm10[3,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,4,5],zero,xmm6[u,u,u,u,6,7],zero,xmm6[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: addq $168, %rsp +; AVX1-ONLY-NEXT: addq $216, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3025,12 +3028,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-NEXT: 
vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] @@ -3061,12 +3064,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = @@ -3085,14 +3088,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = 
[255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 @@ -3110,10 +3113,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm13, %ymm14, %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 @@ -3127,10 +3130,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm7[1,2,3,0,1,14],zero,ymm7[0,1,0,1,14,15],zero,ymm7[15,16,17,18,19,16],zero,ymm7[30,31,16,17,16,17],zero,ymm7[31,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] @@ -3194,12 +3197,12 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
(%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] @@ -3229,14 +3232,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 @@ -3254,10 +3257,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: 
vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 @@ -3277,10 +3280,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 @@ -3294,10 +3297,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm7[1,2,3,0,1,14],zero,ymm7[0,1,0,1,14,15],zero,ymm7[15,16,17,18,19,16],zero,ymm7[30,31,16,17,16,17],zero,ymm7[31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] @@ -3942,16 +3945,14 @@ define void @store_i8_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE: # %bb.0: ; SSE-NEXT: subq $648, %rsp # imm = 0x288 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm14 ; SSE-NEXT: movdqa 48(%rsi), %xmm2 ; SSE-NEXT: movdqa 48(%rdx), %xmm3 -; SSE-NEXT: movdqa 48(%rcx), %xmm9 -; SSE-NEXT: movdqa 48(%r8), %xmm14 +; SSE-NEXT: movdqa 48(%rcx), %xmm10 +; SSE-NEXT: movdqa 48(%r8), %xmm5 ; SSE-NEXT: movdqa 48(%r9), %xmm8 -; SSE-NEXT: movdqa 48(%rax), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rax), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm6, %xmm0 @@ -3963,15 +3964,15 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] @@ -3979,20 +3980,22 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] ; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] @@ -4001,7 +4004,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm2, 
%xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 @@ -4011,22 +4014,22 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] @@ -4034,42 +4037,43 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm6, %xmm8 ; SSE-NEXT: pand %xmm4, %xmm8 ; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm8 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: pandn %xmm8, %xmm11 -; 
SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm11 ; SSE-NEXT: por %xmm11, %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] -; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm11 @@ -4078,14 +4082,16 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm6, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] @@ -4093,38 +4099,39 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm9, %xmm14 ; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: 
movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pand %xmm11, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm6, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 @@ -4138,84 +4145,87 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,1,2,3] +; SSE-NEXT: movdqa (%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm9 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa (%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; SSE-NEXT: movdqa (%r9), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,1,2,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: movdqa (%r8), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movdqa (%rax), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa (%rax), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = 
[0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa 16(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4224,1065 +4234,1072 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa 16(%rdx), %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa 16(%r9), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: movdqa 16(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: por %xmm3, %xmm6 ; SSE-NEXT: movdqa 16(%rax), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: 
pandn %xmm14, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa 32(%rcx), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,0,3] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa 32(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: movdqa 32(%rdx), %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa 32(%r9), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa 32(%rdx), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa 32(%r9), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r8), %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: movdqa 32(%rax), %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm15, 
%xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufhw $170, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm6[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,0] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: 
pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa %xmm7, %xmm10 ; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,2,2] +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[1,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm10 ; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm10, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm8 +; SSE-NEXT: por %xmm2, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = 
xmm5[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pand %xmm6, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} 
xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm6, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] 
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} 
xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = 
xmm3[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,1,3] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[1,2,2,3,4,5,6,7] -; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm10 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm12, %xmm9 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] +; SSE-NEXT: pandn %xmm6, %xmm10 ; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: 
por %xmm0, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,6,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm8 ; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] -; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm8 ; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm8, %xmm15 ; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 
= xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm12, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm15, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: por %xmm8, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm14, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm12, %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,6,7] +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; 
SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pand %xmm8, %xmm14 -; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm14, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,6,5,7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: por %xmm14, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm15, %xmm9 ; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm12, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,1,3] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,7] +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,2,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,5,5,5,5] +; 
SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pand %xmm14, %xmm10 +; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; 
SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm12, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,2] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm13[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,1] -; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 ; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,2] +; SSE-NEXT: por %xmm15, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; 
SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm10, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm15 -; SSE-NEXT: por %xmm15, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: por %xmm3, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,5,7] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, 
%xmm2 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm3, 368(%rax) -; SSE-NEXT: movdqa %xmm11, 352(%rax) -; SSE-NEXT: movdqa %xmm10, 336(%rax) +; SSE-NEXT: movdqa %xmm1, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 288(%rax) @@ -5299,7 +5316,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) @@ -5336,7 +5353,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $632, %rsp # imm = 0x278 +; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] @@ -5349,69 +5366,72 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,3],zero,xmm0[u,u,u,u,4,5],zero,xmm0[u,u,u] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u],zero,zero,xmm6[9,u,u,u,u],zero,zero,xmm6[10,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 
%xmm9 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 @@ -5424,123 +5444,119 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 
32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[4,u,u,u,u],zero,zero,xmm2[5,u,u,u,u],zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,6,7],zero,xmm3[u,u,u,u,8,9],zero,xmm3[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,2,3],zero,xmm3[u,u,u,u,4,5],zero,xmm3[u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u],zero,zero,xmm11[2,u,u,u,u],zero,zero,xmm11[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 @@ -5565,188 +5581,189 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, 
%ymm1, %ymm15 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u> ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 
{{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm2[4,u,u,u,u],zero,zero,xmm2[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u],zero,zero,xmm2[2,u,u,u,u],zero,zero,xmm2[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,4,5],zero,xmm3[u,u,u,u,6,7],zero,xmm3[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u],zero,xmm12[7,u,u,u,u,u],zero,xmm12[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb 
{{.*#+}} xmm1 = xmm2[u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,7],zero,xmm4[u,u,u,u,u,8],zero,xmm4[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[u,u,6,7,8,9],zero,xmm1[u,u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm4[9,u,u],zero,zero,zero,zero,xmm4[10,u,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,xmm1[u,6,7,8,9,10],zero,xmm1[u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero,zero,xmm2[10,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4],zero,xmm1[6,7,8,9,10,11],zero,xmm1[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm11[9],zero,zero,zero,zero,zero,zero,xmm11[10],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u],zero,xmm1[7,u,u,u,u,u],zero,xmm1[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[u,u,6,7,8,9],zero,xmm4[u,u,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm6[9,u,u],zero,zero,zero,zero,xmm6[10,u,u],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10,11],zero,xmm4[13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm12[9],zero,zero,zero,zero,zero,zero,xmm12[10],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,1,2,3,4],zero,xmm0[u,u,8,9,10,11],zero,xmm0[u,u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u],zero,zero,zero,zero,xmm4[7,u,u],zero,zero,zero,zero,xmm4[8,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,1,2,3,4,5],zero,xmm0[u,8,9,10,11,12],zero,xmm0[u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u],zero,zero,zero,zero,zero,xmm2[7,u],zero,zero,zero,zero,zero,xmm2[8,u],zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6],zero,xmm0[8,9,10,11,12,13],zero,xmm0[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[6],zero,zero,zero,zero,zero,zero,xmm11[7],zero,zero,zero,zero,zero,zero,xmm11[8],zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4],zero,xmm3[u,u,8,9,10,11],zero,xmm3[u,u,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u],zero,zero,zero,zero,xmm6[7,u,u],zero,zero,zero,zero,xmm6[8,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4,5],zero,xmm3[u,8,9,10,11,12],zero,xmm3[u,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4,5,6],zero,xmm3[8,9,10,11,12,13],zero,xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[6],zero,zero,zero,zero,zero,zero,xmm12[7],zero,zero,zero,zero,zero,zero,xmm12[8],zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm5[11,u,u],zero,zero,zero,zero,xmm5[12,u,u],zero,zero,zero,zero,xmm5[13] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[u,4,5,6,7,8],zero,xmm1[u,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[11,u],zero,zero,zero,zero,zero,xmm2[12,u],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8,9],zero,xmm1[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[11],zero,zero,zero,zero,zero,zero,xmm11[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[u,4,5,6,7,8],zero,xmm3[u,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[11,u],zero,zero,zero,zero,zero,xmm5[12,u],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8,9],zero,xmm3[11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[13,u],zero,zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,zero,xmm2[15,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[13,u],zero,zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,zero,xmm5[15,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm11[13],zero,zero,zero,zero,zero,zero,xmm11[14],zero,zero,zero,zero,zero,zero,xmm11[15] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 @@ -5755,111 +5772,110 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[13,u,u,u,u],zero,zero,xmm13[14,u,u,u,u],zero,zero,xmm13[15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[4,5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = 
[0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = 
xmm11[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm8, 
%ymm13, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5880,13 +5896,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) -; AVX1-ONLY-NEXT: addq $632, %rsp # imm = 0x278 +; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $808, %rsp # imm = 0x328 +; AVX2-SLOW-NEXT: subq $824, %rsp # imm = 0x338 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5971,339 +5987,340 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm3, 
%xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] -; AVX2-SLOW-NEXT: 
vpshufb %xmm9, %xmm15, %xmm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[1,1,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm15, %ymm13, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm13, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm12 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 
= -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm12 -; AVX2-SLOW-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm12, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm15 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm11, %ymm15, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = 
<255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm10, %ymm12, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, 
%ymm0, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; 
AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm5 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: 
vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm9 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,ymm6[27,20,21,26],zero,ymm6[24],zero,ymm6[26,27,26,27],zero,ymm6[25] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm1 +; 
AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vpor %ymm9, %ymm11, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm10 ; AVX2-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm9, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm15[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX2-SLOW-NEXT: vpshufhw $190, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = 
ymm8[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6326,8 +6343,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 416(%rax) -; AVX2-SLOW-NEXT: addq $808, %rsp # imm = 0x328 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 416(%rax) +; AVX2-SLOW-NEXT: addq $824, %rsp # imm = 0x338 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -6336,23 +6353,24 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6360,18 +6378,18 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -6388,361 +6406,358 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
<255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX2-FAST-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: 
vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] +; AVX2-FAST-NEXT: 
vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,0,0,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm13 -; AVX2-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm14 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm14, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm14 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm13 -; AVX2-FAST-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm12 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm14, 
%xmm15, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = 
ymm8[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,6] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,5,5,6] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-NEXT: 
vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[23],zero,ymm11[27,20,21,26],zero,ymm11[24],zero,ymm11[26,27,26,27],zero,ymm11[25] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = 
ymm10[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm11, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm11, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,4,5,5,7,4,5] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vpor %ymm7, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: 
vpshuflw {{.*#+}} ymm12 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm6, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm2 +; 
AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm10 -; AVX2-FAST-NEXT: vpor %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 320(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6774,23 +6789,24 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6798,18 +6814,18 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -6826,236 +6842,244 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm10 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm14, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm12, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm15, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm14, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm10, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm15[8],mem[8],xmm15[9],mem[9],xmm15[10],mem[10],xmm15[11],mem[11],xmm15[12],mem[12],xmm15[13],mem[13],xmm15[14],mem[14],xmm15[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[17,18,19,30],zero,ymm12[28],zero,ymm12[28,29,30,31],zero,ymm12[29],zero,ymm12[31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm6, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, 
%ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[23],zero,ymm15[27,20,21,26],zero,ymm15[24],zero,ymm15[26,27,26,27],zero,ymm15[25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[23],zero,ymm5[27,20,21,26],zero,ymm5[24],zero,ymm5[26,27,26,27],zero,ymm5[25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 ; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 @@ -7074,53 +7098,53 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm10, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm11, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm14, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm12, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, 
%ymm10, %ymm13, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm13, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm15, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm7, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 @@ -7129,48 +7153,48 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm12, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: 
vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) @@ -7198,482 +7222,472 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512F-SLOW-LABEL: store_i8_stride7_vf64: -; AVX512F-SLOW: # 
%bb.0: -; AVX512F-SLOW-NEXT: subq $1368, %rsp # imm = 0x558 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm7[23],zero,ymm7[21,22,23,26],zero,ymm7[24],zero,ymm7[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm7[18,19,20,21],zero,ymm7[19],zero,ymm7[25,26,27,22],zero,ymm7[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm7, %ymm26 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm30 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 -; 
AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vporq %xmm2, %xmm3, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm3 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm13 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,0,1],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandn %ymm1, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm12[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] -; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm15 -; AVX512F-SLOW-NEXT: 
vinserti64x4 $1, %ymm10, %zmm15, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm31 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm22, %zmm22 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm15[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512F-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm15, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm15 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = 
[255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm15, %ymm25, %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm30[2,3,2,3] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm10[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm13[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm20[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm19[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpand %ymm4, %ymm11, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm9, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm23[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm11 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm0 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm9, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm17 = zmm24[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm17 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm17 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm27[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm22[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm4, %ymm16, %ymm15 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, %ymm4, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm18, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpor %ymm13, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm20 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm31[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm10 -; AVX512F-SLOW-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm29[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm28[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm24 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm21 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,0,1,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm4 -; AVX512F-SLOW-NEXT: vporq %ymm15, %ymm16, %ymm5 -; AVX512F-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm10[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 
-; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 -; AVX512F-SLOW-NEXT: vporq %ymm22, %ymm24, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm25[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm20[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm12[0,0,1,0,4,4,5,4] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 -; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512F-SLOW-NEXT: addq $1368, %rsp # imm = 0x558 -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: store_i8_stride7_vf64: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 
= [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, 
%zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm6[18,19,20,21],zero,ymm6[19],zero,ymm6[25,26,27,22],zero,ymm6[20],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[0,1,14],zero,ymm14[12,13,0,1,14,15],zero,ymm14[3,12,13,2,3,16],zero,ymm14[30,31,28,29,16,17],zero,ymm14[31,18,19,28,29,18],zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vporq %xmm1, %xmm3, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,0,1],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), 
%ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,1,1,4,4,5,5] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm15, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm15[23],zero,ymm15[21,22,23,26],zero,ymm15[24],zero,ymm15[28,29,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm15[18,19,20,21],zero,ymm15[19],zero,ymm15[25,26,27,22],zero,ymm15[20],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[21],zero,ymm14[19],zero,zero,zero,zero,ymm14[22],zero,ymm14[20],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpandnq %ymm15, %ymm25, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm5, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm23[2,3,2,3] +; 
AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm18[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,1,1,4,4,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpandq %ymm19, %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vporq %zmm2, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm26[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded 
Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm27[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm30[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm31[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm19, %ymm21, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm19, %ymm10, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm28[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm21 = zmm10[0,1,0,1],mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: 
vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm29[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm18 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, (%rsp), %ymm23 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm23 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm26 = mem[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm14[0,0,1,0] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm15, %ymm16, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm10[2,2,3,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vporq %ymm23, %ymm26, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm25[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm19[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm12[0,0,1,0,4,4,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $1288, %rsp # imm = 0x508 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i8_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX512F-ONLY-FAST-NEXT: subq $1496, %rsp # imm = 0x5D8 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm1 @@ -7682,438 +7696,891 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = ; AVX512F-ONLY-FAST-NEXT: 
vpshufb %xmm2, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm12, %xmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero,zero,ymm15[18] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm2, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [2,2,3,3,2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm23, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, 
%ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm31, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm14, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm23, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero,zero ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm0, %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-ONLY-FAST-NEXT: 
vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm9[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm13[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; 
AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm7[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, 
%ymm14, %zmm31, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm14, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm5, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 
64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm9, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm5, %ymm27, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] ; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm24, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm3, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm1, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm5, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm26 = zmm0[0,1,0,1],mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] ; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,4,5,5,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq 
{{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm11[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; +; AVX512DQ-SLOW-LABEL: store_i8_stride7_vf64: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $1288, %rsp # imm = 0x508 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} 
ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rax), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm6[18,19,20,21],zero,ymm6[19],zero,ymm6[25,26,27,22],zero,ymm6[20],zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm27 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX512DQ-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm2 +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[0,1,14],zero,ymm14[12,13,0,1,14,15],zero,ymm14[3,12,13,2,3,16],zero,ymm14[30,31,28,29,16,17],zero,ymm14[31,18,19,28,29,18],zero +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm25 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512DQ-SLOW-NEXT: vporq %xmm1, %xmm3, %xmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512DQ-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 +; 
AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,0,1],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512DQ-SLOW-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = 
ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,1,1,4,4,5,5] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm15, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm15[23],zero,ymm15[21,22,23,26],zero,ymm15[24],zero,ymm15[28,29,26,27]
+; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm15[18,19,20,21],zero,ymm15[19],zero,ymm15[25,26,27,22],zero,ymm15[20],zero
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm16
+; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[21],zero,ymm14[19],zero,zero,zero,zero,ymm14[22],zero,ymm14[20],zero,zero
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %ymm25, %ymm14
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %ymm29, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpshufb %ymm0, %ymm14, %ymm3
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512DQ-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpshufb %ymm4, %ymm14, %ymm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512DQ-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpshufb %ymm4, %ymm15, %ymm0
+; AVX512DQ-SLOW-NEXT:    vpshufb %ymm4, %ymm13, %ymm2
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %ymm2, %ymm29
+; AVX512DQ-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-SLOW-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpshufb %ymm4, %ymm13, %ymm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vpshufb %ymm4, %ymm15, %ymm2
+; AVX512DQ-SLOW-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero
+; AVX512DQ-SLOW-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,5,7]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512DQ-SLOW-NEXT:    vpandnq %ymm15, %ymm25, %ymm15
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm4, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm14 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
+; AVX512DQ-SLOW-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
+; AVX512DQ-SLOW-NEXT:    vpshufb %xmm14, %xmm9, %xmm5
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, %xmm21, %zmm5, %zmm25
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm21 = ymm1[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm14 = ymm23[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm13 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %xmm19, %xmm2
+; AVX512DQ-SLOW-NEXT:    vpshufb %xmm13, %xmm2, %xmm12
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm8[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm3[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm11 = ymm18[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm10 = ymm17[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %ymm24, %ymm7
+; AVX512DQ-SLOW-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,0,1,1,4,4,5,5]
+; AVX512DQ-SLOW-NEXT:    vpshufb %xmm13, %xmm1, %xmm8
+; AVX512DQ-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vporq %zmm1, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vbroadcasti64x2 {{.*#+}} ymm19 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512DQ-SLOW-NEXT:    # ymm19 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpandq %ymm19, %ymm6, %ymm0
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vporq %zmm2, %zmm0, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm3 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm26[2,3,2,3,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $226, %zmm1, %zmm3, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm1 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vporq %zmm1, %zmm2, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm6 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} zmm1 = zmm27[2,3,2,3,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpternlogq $184, %zmm2, %zmm3, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} zmm6 = zmm30[0,1,0,1,4,5,4,5]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} zmm2 = zmm31[0,1,0,1,4,5,4,5]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $226, %zmm6, %zmm3, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpternlogq $248, %ymm19, %ymm21, %ymm14
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm7[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $236, %ymm19, %ymm10, %ymm3
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm12, %zmm14, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm9[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm7
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm15, %zmm7, %zmm7
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $184, %zmm6, %zmm9, %zmm7
+; AVX512DQ-SLOW-NEXT:    vpor %ymm5, %ymm11, %ymm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
+; AVX512DQ-SLOW-NEXT:    vpternlogq $226, %zmm5, %zmm9, %zmm3
+; AVX512DQ-SLOW-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 16-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm28[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm8 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3],xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
+; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-SLOW-NEXT:    vmovdqa 32(%rax), %xmm11
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm21 = zmm10[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm10[1,1,0,0,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0]
+; AVX512DQ-SLOW-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vpshufb %xmm13, %xmm10, %xmm14
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm14, %zmm12, %zmm12
+; AVX512DQ-SLOW-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm11[1,1,0,0,4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,0]
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm15 = mem[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm16 = ymm29[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm17 = mem[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm18 = mem[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm5, %ymm10
+; AVX512DQ-SLOW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-SLOW-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpshufb %xmm13, %xmm11, %xmm13
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
+; AVX512DQ-SLOW-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,6]
+; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3]
+; AVX512DQ-SLOW-NEXT:    vpermq $238, (%rsp), %ymm23 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm23 = mem[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # ymm26 = mem[2,3,2,3]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm8
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm14[0,0,1,0]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm6, %zmm9, %zmm6
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm6
+; AVX512DQ-SLOW-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload
+; AVX512DQ-SLOW-NEXT:    # zmm7 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7
+; AVX512DQ-SLOW-NEXT:    vporq %ymm15, %ymm16, %ymm0
+; AVX512DQ-SLOW-NEXT:    vporq %ymm17, %ymm18, %ymm8
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm10[2,2,3,2]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm5, %zmm8, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm11[0,1,0,1]
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm13, %zmm0
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512DQ-SLOW-NEXT:    vporq %ymm23, %ymm26, %ymm1
+; AVX512DQ-SLOW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512DQ-SLOW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-SLOW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} zmm1 = zmm25[0,1,0,1,4,5,4,5]
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} zmm2 = zmm19[0,1,0,1,4,5,4,5]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2
+; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} zmm1 = zmm12[0,0,1,0,4,4,5,4]
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm1
+; AVX512DQ-SLOW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1
+; AVX512DQ-SLOW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, 256(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm7, 384(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm6, 192(%rax)
+; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm22, 64(%rax)
+; AVX512DQ-SLOW-NEXT:    addq $1288, %rsp # imm = 0x508
+; AVX512DQ-SLOW-NEXT:    vzeroupper
+; AVX512DQ-SLOW-NEXT:    retq
+;
 ; AVX512DQ-FAST-LABEL: store_i8_stride7_vf64:
 ; AVX512DQ-FAST:       # %bb.0:
-; AVX512DQ-FAST-NEXT:    subq $1432, %rsp # imm = 0x598
+; AVX512DQ-FAST-NEXT:    subq $1496, %rsp # imm = 0x5D8
 ; AVX512DQ-FAST-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %ymm1
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm19
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rcx), %ymm7
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %ymm15
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm15, %ymm17
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm8
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
-; AVX512DQ-FAST-NEXT:    vmovdqa %ymm3, %ymm10
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25]
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %ymm15
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25]
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %ymm4
 ; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r8), %ymm1
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, %ymm15
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
-; AVX512DQ-FAST-NEXT:    vmovdqa %ymm2, %ymm6
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rax), %ymm1
@@ -8122,407 +8589,422 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18]
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm26
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero
 ; AVX512DQ-FAST-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm1, %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm6, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm30
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %ymm10
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm10, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm25
 ; AVX512DQ-FAST-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm9, %ymm1, %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm25
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm22
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u]
-; AVX512DQ-FAST-NEXT:    vporq %ymm0, %ymm1, %ymm22
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm5, %ymm2, %ymm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm5, %ymm29
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vporq %ymm0, %ymm1, %ymm23
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdx), %xmm5
 ; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rcx), %xmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = 
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm2, %xmm23
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm1, %xmm30
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm2, %xmm18
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm1, %xmm20
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = 
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm3, %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm5, %xmm21
 ; AVX512DQ-FAST-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %xmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %xmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = 
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm20
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = 
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
-; AVX512DQ-FAST-NEXT:    vporq %xmm0, %xmm3, %xmm21
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %xmm12
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r8), %xmm11
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6>
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm7, %xmm12, %xmm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm12, %xmm29
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rdi), %xmm11
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rsi), %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm14 = 
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm14, %xmm1, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm1, %xmm28
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = 
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm11, %xmm5
+; AVX512DQ-FAST-NEXT:    vpor %xmm0, %xmm5, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r9), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%r8), %xmm9
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm0, %xmm16
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128>
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
-; AVX512DQ-FAST-NEXT:    vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm17, %ymm3
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm16, %ymm5
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm19, %ymm12
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm5, %ymm12, %ymm5
-; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm9, %xmm13
+; AVX512DQ-FAST-NEXT:    vpor %xmm12, %xmm13, %xmm12
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm7, %ymm24
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm17, %ymm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm25, %ymm7
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm7, %ymm13, %ymm7
+; AVX512DQ-FAST-NEXT:    vpor %ymm6, %ymm7, %ymm6
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero,zero,ymm15[18]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero
+; AVX512DQ-FAST-NEXT:    vpor %ymm6, %ymm7, %ymm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm10, %ymm18
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero
-; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm9, %ymm15, %ymm3
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
-; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm29, %ymm6
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm6, %ymm4, %ymm6
+; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm6, %ymm3
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-FAST-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rsi), %xmm4
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm14, %xmm4, %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm4, %xmm17
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdi), %xmm7
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-FAST-NEXT:    vpor %xmm3, %xmm1, %xmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm10
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm23, %xmm1
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
-; AVX512DQ-FAST-NEXT:    vpor %xmm1, %xmm4, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rcx), %xmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm18, %xmm1
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm3, %xmm12
+; AVX512DQ-FAST-NEXT:    vmovdqa (%rdx), %xmm5
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-FAST-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; AVX512DQ-FAST-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm7, %xmm2, %xmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm2, %xmm4
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm2, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm2, %xmm3
 ; AVX512DQ-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %xmm2
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm2, %xmm5
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa (%r8), %xmm4
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vpor %xmm1, %xmm2, %xmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm26, %ymm6
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm24, %ymm7
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm30, %ymm15
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX512DQ-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm15, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm25
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm30, %ymm8
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512DQ-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
+; AVX512DQ-FAST-NEXT:    # ymm14 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512DQ-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm14, %ymm10, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm10, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm29
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm0
-; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = 
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm23, %zmm0, %zmm1
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm4 = 
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm1[4,5,6,7]
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
-; AVX512DQ-FAST-NEXT:    # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} ymm23 = [2,2,3,3,2,2,3,3]
+; AVX512DQ-FAST-NEXT:    # ymm23 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%rax), %xmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FAST-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512DQ-FAST-NEXT:    vpermd %ymm1, %ymm6, %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512DQ-FAST-NEXT:    vpandn %ymm1, %ymm4, %ymm4
+; AVX512DQ-FAST-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm2, %ymm23, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512DQ-FAST-NEXT:    vpandn %ymm2, %ymm3, %ymm2
 ; AVX512DQ-FAST-NEXT:    vmovdqa (%rax), %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm26
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm22
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm31
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm25, %ymm1
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
-; AVX512DQ-FAST-NEXT:    # ymm24 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpternlogq $248, %ymm24, %ymm8, %ymm15
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm11, %xmm4
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm29, %xmm2
-; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm30, %xmm1
-; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm3, %xmm28
-; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm9, %xmm27
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm10, %xmm25
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7>
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm18
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm26
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm30
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm22, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vbroadcasti64x2 {{.*#+}} ymm31 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800]
+; AVX512DQ-FAST-NEXT:    # ymm31 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, %ymm31, %ymm2, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm16, %xmm10
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm21, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm2, %xmm22
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm5, %xmm20
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm12, %xmm21
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm3, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm20, %xmm11
-; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
-; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
-; AVX512DQ-FAST-NEXT:    vmovdqa %xmm14, %xmm8
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm16, %xmm5
-; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm17, %xmm3
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX512DQ-FAST-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm3, %xmm0
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm21, %zmm30
-; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm7, %ymm11
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm3, %ymm1, %ymm3
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm19, %ymm3
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm3, %ymm10
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rax), %xmm10
-; AVX512DQ-FAST-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6]
-; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm6, %ymm0
+; AVX512DQ-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
+; AVX512DQ-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm8, %ymm12
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm24, %ymm11
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm27 = ymm1[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm14, %ymm13, %ymm1
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm28 = ymm1[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqa 32(%rax), %xmm0
+; AVX512DQ-FAST-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FAST-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm23, %ymm0
 ; AVX512DQ-FAST-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm11, %ymm10
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm29, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm13, %ymm8
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero,zero
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm26, %ymm1
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm19, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm4
 ; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5]
 ; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512DQ-FAST-NEXT:    vpandnq %ymm0, %ymm26, %ymm0
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm7, %zmm7
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
-; AVX512DQ-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512DQ-FAST-NEXT:    vpandnq %ymm0, %ymm25, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm4, %zmm23
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512DQ-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm4, %ymm15, %ymm13
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm9, %xmm9
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm0, %ymm26
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm23, %ymm2
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm15, %ymm9
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm17 = ymm9[2,3,2,3]
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm4, %ymm1, %ymm4
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm25, %xmm5
-; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm28, %xmm5
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm5, %xmm14
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm30, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm13
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm29
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm25 = ymm13[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm13 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm13, %xmm5, %xmm5
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm13, %xmm7, %xmm7
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm1, %ymm30
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa %ymm6, %ymm2
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm7[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm7 = ymm12[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm26[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm21, %xmm4
+; AVX512DQ-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %xmm22, %xmm4
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm4, %xmm13
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm11[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512DQ-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm12, %ymm15, %ymm9
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 16-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm12, %ymm3, %ymm12
 ; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512DQ-FAST-NEXT:    # ymm8 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm15, %ymm15
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm8, %ymm1, %ymm2
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpor %ymm6, %ymm4, %ymm1
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm1
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
-; AVX512DQ-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpternlogq $248, %ymm4, %ymm11, %ymm31
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm14, %zmm31, %zmm9
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm9
-; AVX512DQ-FAST-NEXT:    vpor %ymm5, %ymm12, %ymm1
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512DQ-FAST-NEXT:    vpor %ymm3, %ymm15, %ymm3
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm1, %zmm11, %zmm3
-; AVX512DQ-FAST-NEXT:    vpandq %ymm4, %ymm21, %ymm1
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm20, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FAST-NEXT:    vporq %zmm4, %zmm1, %zmm1
-; AVX512DQ-FAST-NEXT:    vpandq %ymm24, %ymm19, %ymm4
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm17, %zmm4, %zmm4
-; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm5 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FAST-NEXT:    vporq %zmm5, %zmm4, %zmm4
-; AVX512DQ-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4
-; AVX512DQ-FAST-NEXT:    vpandq %ymm24, %ymm18, %ymm1
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm13, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm5 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FAST-NEXT:    vporq %zmm5, %zmm1, %zmm1
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm4, %zmm5, %zmm1
-; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm4 = mem[0,1,0,1,4,5,4,5]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5]
-; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm4, %zmm5, %zmm11
-; AVX512DQ-FAST-NEXT:    vpandq %ymm24, %ymm0, %ymm0
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vpor %ymm6, %ymm14, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655]
+; AVX512DQ-FAST-NEXT:    # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpternlogq $248, %ymm5, %ymm7, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm13, %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm2, %zmm0, %zmm7
+; AVX512DQ-FAST-NEXT:    vpor %ymm10, %ymm8, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FAST-NEXT:    vpor %ymm4, %ymm9, %ymm3
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm6
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm2, %zmm0, %zmm6
+; AVX512DQ-FAST-NEXT:    vpandq %ymm5, %ymm27, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm28, %zmm0
 ; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
 ; AVX512DQ-FAST-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512DQ-FAST-NEXT:    vpandq %ymm31, %ymm24, %ymm2
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm17, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vporq %zmm3, %zmm2, %zmm2
+; AVX512DQ-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
+; AVX512DQ-FAST-NEXT:    vpandq %ymm31, %ymm25, %ymm0
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm19, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm3 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vporq %zmm3, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-FAST-NEXT:    vpternlogq $184, %zmm2, %zmm0, %zmm3
+; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm8 = mem[0,1,0,1,4,5,4,5]
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm2, %zmm0, %zmm8
+; AVX512DQ-FAST-NEXT:    vpandq %ymm31, %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm12, %zmm1
 ; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm4 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FAST-NEXT:    vporq %zmm2, %zmm4, %zmm2
-; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm0, %zmm5, %zmm2
+; AVX512DQ-FAST-NEXT:    vporq %zmm2, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm2 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm5 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vporq %zmm2, %zmm5, %zmm22
+; AVX512DQ-FAST-NEXT:    vpternlogq $226, %zmm1, %zmm0, %zmm22
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX512DQ-FAST-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX512DQ-FAST-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
 ; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-FAST-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1]
-; AVX512DQ-FAST-NEXT:    vpermd %ymm4, %ymm5, %ymm18
+; AVX512DQ-FAST-NEXT:    vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm26 = zmm0[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[1,1,0,0,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm19
 ; AVX512DQ-FAST-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpermd %ymm12, %ymm5, %ymm17
-; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
-; AVX512DQ-FAST-NEXT:    vpshufb %xmm12, %xmm0, %xmm12
-; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512DQ-FAST-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm14
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm5, %ymm2, %ymm17
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm5, %xmm4, %xmm10
+; AVX512DQ-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
+; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512DQ-FAST-NEXT:    # ymm12 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm29, %ymm0
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm12, %ymm0, %ymm13
 ; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm13, %ymm0, %ymm13
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm12, %ymm0, %ymm12
 ; AVX512DQ-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
 ; AVX512DQ-FAST-NEXT:    # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm26, %ymm4
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm4, %ymm5
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
-; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm6, %ymm8
-; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-FAST-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5]
-; AVX512DQ-FAST-NEXT:    vpermd %ymm6, %ymm19, %ymm19
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm20 = mem[2,3,2,3,6,7,6,7]
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20
-; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # ymm21 = mem[0,1,0,1]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
+; AVX512DQ-FAST-NEXT:    vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31]
+; AVX512DQ-FAST-NEXT:    vpshufb %ymm0, %ymm9, %ymm0
+; AVX512DQ-FAST-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FAST-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FAST-NEXT:    vmovdqa {{.*#+}} ymm9 = [4,5,4,5,5,7,4,5]
+; AVX512DQ-FAST-NEXT:    vpermd %ymm15, %ymm9, %ymm20
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FAST-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FAST-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # zmm24 = mem[2,3,2,3,6,7,6,7]
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24
+; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    # ymm25 = mem[0,1,0,1]
 ; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
 ; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
 ; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm18, %zmm6, %zmm1
-; AVX512DQ-FAST-NEXT:    vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1
-; AVX512DQ-FAST-NEXT:    vpor %ymm5, %ymm14, %ymm5
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
-; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7
-; AVX512DQ-FAST-NEXT:    vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FAST-NEXT:    # zmm3 = mem[0,1,0,1,4,5,4,5]
-; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5]
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm12, %zmm17, %zmm3
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3
-; AVX512DQ-FAST-NEXT:    vpor %ymm4, %ymm15, %ymm4
-; AVX512DQ-FAST-NEXT:    vpor %ymm13, %ymm8, %ymm5
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512DQ-FAST-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7]
-; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm19, %zmm0
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0
-; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512DQ-FAST-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm25
+; AVX512DQ-FAST-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FAST-NEXT:    vpternlogq $248,
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm16 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm23 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm11[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm14, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm1 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax) -; AVX512DQ-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) +; AVX512DQ-FAST-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -8537,9 +9019,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-SLOW-NEXT: vpermw %ymm13, %ymm2, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %ymm11 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
[13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX512BW-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm3 ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm12 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] ; AVX512BW-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm5 @@ -8551,16 +9033,16 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm11 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm10 ; AVX512BW-SLOW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm10 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm17, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 +; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm18, %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm18, %ymm7 +; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm17, %ymm7 ; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm29 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm23 @@ -8568,31 +9050,31 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm20, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm22 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm21, %ymm7 +; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm7 ; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm16 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm25 = 
xmm16[8],xmm22[8],xmm16[9],xmm22[9],xmm16[10],xmm22[10],xmm16[11],xmm22[11],xmm16[12],xmm22[12],xmm16[13],xmm22[13],xmm16[14],xmm22[14],xmm16[15],xmm22[15] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm16[8],xmm20[8],xmm16[9],xmm20[9],xmm16[10],xmm20[10],xmm16[11],xmm20[11],xmm16[12],xmm20[12],xmm16[13],xmm20[13],xmm16[14],xmm20[14],xmm16[15],xmm20[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm25 = xmm25[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm8 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm7 ; AVX512BW-SLOW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm8 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm7 {%k1} ; AVX512BW-SLOW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm8 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm11 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm7 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm10, %ymm10 ; AVX512BW-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm28 ; AVX512BW-SLOW-NEXT: vpshufb %ymm2, %ymm28, %ymm2 ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm31 @@ -8600,31 +9082,31 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm31[20],zero,ymm31[18],zero,ymm31[20,21,20,21],zero,ymm31[19],zero,ymm31[19,20,21,22],zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm28[20],zero,ymm28[18],zero,zero,zero,zero,ymm28[21],zero,ymm28[19],zero,zero,zero,zero,ymm28[22] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm11, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm28[20],zero,ymm28[18],zero,zero,zero,zero,ymm28[21],zero,ymm28[19],zero,zero,zero,zero,ymm28[22] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm10, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm4 ; AVX512BW-SLOW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm25[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm25[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] ; AVX512BW-SLOW-NEXT: movl $676341840, %esi # imm = 0x28502850 ; AVX512BW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-SLOW-NEXT: vpshufb 
{{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm11 -; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm2, %ymm19 -; AVX512BW-SLOW-NEXT: vporq %ymm11, %ymm19, %ymm11 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm19 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm5[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm10 +; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm5, %ymm19 +; AVX512BW-SLOW-NEXT: vporq %ymm10, %ymm19, %ymm10 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm19 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm3 ; AVX512BW-SLOW-NEXT: vporq %ymm19, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm19 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm19 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm24 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,3,2,3] @@ -8632,25 +9114,25 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm3, %zmm3 ; AVX512BW-SLOW-NEXT: movabsq $3485998880071096368, %rsi # imm = 0x3060C183060C1830 ; AVX512BW-SLOW-NEXT: kmovq %rsi, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm11 {%k3} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm10 {%k3} ; AVX512BW-SLOW-NEXT: movabsq $-4357498600088870461, %rsi # imm = 0xC3870E1C3870E1C3 ; AVX512BW-SLOW-NEXT: kmovq %rsi, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2} -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm20[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm10 {%k2} +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm21[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] ; AVX512BW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] ; AVX512BW-SLOW-NEXT: movl $338170920, %esi # imm = 0x14281428 ; AVX512BW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm21, %ymm3 {%k2} +; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm22, %ymm3 {%k2} ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, 
%zmm26 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm17[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm18[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,3,3,4,6,7,7] -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512BW-SLOW-NEXT: vpshufb %ymm7, %ymm18, %ymm4 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm17, %ymm4 ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm4[2,3,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 @@ -8660,7 +9142,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm19, %zmm19 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm26, %zmm19 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm26 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm26 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] @@ -8682,24 +9164,24 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm19 {%k3} ; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm25[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm0 {%k2} +; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm0 {%k2} ; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] ; AVX512BW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] ; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm25 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512BW-SLOW-NEXT: vporq %ymm2, %ymm25, %ymm2 +; AVX512BW-SLOW-NEXT: vporq %ymm5, %ymm25, %ymm5 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm25 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm25 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512BW-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,3,3,4,6,7,7] ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] @@ -8724,20 +9206,20 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k3 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm25 {%k3} -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm20[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm21[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm21[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm21, %ymm5 -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm20, %ymm6 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm22[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm22, %ymm5 +; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm6 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm5, %zmm5 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm5, %zmm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm5[18,19,20,21],zero,zmm5[19],zero,zmm5[25,26,27,22],zero,zmm5[20],zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm5[55],zero,zmm5[53,54,55,58],zero,zmm5[56],zero,zmm5[60,61,58,59] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,3,2,3,6,7,6,7] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[18],zero,zero,zero,zero,zmm6[21],zero,zmm6[19],zero,zero,zero,zero,zmm6[22],zero,zmm6[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm6[57],zero,zmm6[55],zero,zero,zero,zero,zmm6[58],zero,zmm6[56],zero,zero,zero,zero @@ 
-8747,7 +9229,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm1 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm0[20],zero,zmm0[18],zero,zero,zero,zero,zmm0[21],zero,zmm0[19],zero,zero,zero,zero,zmm0[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm0[57],zero,zmm0[55],zero,zero,zero,zero,zmm0[58],zero,zmm0[56],zero,zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] ; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm1 @@ -8760,82 +9242,82 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm17 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm7 -; AVX512BW-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm4 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-SLOW-NEXT: vpermi2w %zmm30, %zmm9, %zmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm8 +; AVX512BW-SLOW-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; 
AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm1 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] +; AVX512BW-SLOW-NEXT: vpermi2w %zmm30, %zmm9, %zmm11 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> ; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm27, %xmm12 -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512BW-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm7, %zmm7 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm7 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm8 {%k1} ; AVX512BW-SLOW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm23, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm2 ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm29[0],xmm23[0],xmm29[1],xmm23[1],xmm29[2],xmm23[2],xmm29[3],xmm23[3],xmm29[4],xmm23[4],xmm29[5],xmm23[5],xmm29[6],xmm23[6],xmm29[7],xmm23[7] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm5, %zmm2 ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm22, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm20, %xmm5 ; AVX512BW-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm22[0],xmm16[0],xmm22[1],xmm16[1],xmm22[2],xmm16[2],xmm22[3],xmm16[3],xmm22[4],xmm16[4],xmm22[5],xmm16[5],xmm22[6],xmm16[6],xmm22[7],xmm16[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %rax # imm = 
0xC183060C183060C ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm2 -; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm4 +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm4, %zmm0 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} ; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 384(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 192(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -8845,24 +9327,24 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: subq $200, %rsp ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: 
vmovdqa (%rax), %ymm6 -; AVX512BW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa 32(%rax), %ymm12 +; AVX512BW-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX512BW-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa 32(%rax), %ymm13 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm1 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermw %ymm6, %ymm4, %ymm4 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm14 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm14, %ymm7 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm15, %ymm7 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] ; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm1, %ymm8 @@ -8879,89 +9361,89 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm22 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm1, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm6 +; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm1, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm7 ; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm1, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpor %ymm8, %ymm11, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm14 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm16 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm16[8],xmm14[8],xmm16[9],xmm14[9],xmm16[10],xmm14[10],xmm16[11],xmm14[11],xmm16[12],xmm14[12],xmm16[13],xmm14[13],xmm16[14],xmm14[14],xmm16[15],xmm14[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm26 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm26 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm11, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm11, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm12 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm13, %ymm17 -; AVX512BW-FAST-NEXT: vporq %ymm8, %ymm17, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm12, %ymm18 +; AVX512BW-FAST-NEXT: vporq %ymm6, %ymm18, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm18 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm19 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm27 = xmm27[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm27, %zmm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm27, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm22, %ymm22 -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm22, %ymm22 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm1, %ymm18 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %ymm27 -; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm27, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm20, %ymm18 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm27[20],zero,ymm27[18],zero,ymm27[20,21,20,21],zero,ymm27[19],zero,ymm27[19,20,21,22],zero +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %ymm27 +; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm27, %ymm17 +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm1, %ymm20 +; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm26 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm26 = ymm27[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm27[20],zero,ymm27[18],zero,zero,zero,zero,ymm27[21],zero,ymm27[19],zero,zero,zero,zero,ymm27[22] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm26, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm18, %zmm26 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm26 ; AVX512BW-FAST-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm26 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm22 -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm22, %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm22, %ymm17 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %ymm30 ; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm30, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm20, %ymm18 +; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22],zero,ymm30[20] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm21, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm18, %zmm21 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm21 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm31, %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm31, %ymm17 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 ; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm0, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm20, %ymm18 +; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm31[18,19,20,21],zero,ymm31[19],zero,ymm31[21,20,21,22],zero,ymm31[20],zero,ymm31[22,23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm23, %ymm20 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm23 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FAST-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 ; AVX512BW-FAST-NEXT: 
kmovq %r10, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm18 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm17 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm21 ; AVX512BW-FAST-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm17 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 (%rax), %zmm26 ; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm23[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm22[23],zero,zmm22[21,22,23,26],zero,zmm22[24],zero,zmm22[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero @@ -8971,10 +9453,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm22, %zmm28, %zmm29 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zero,zero,zmm0[27],zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm0[60],zero,zmm0[62,63,62,63],zero,zmm0[61],zero,zmm0[63,60,61] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm31[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm31[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm22[23],zero,zero,zero,zero,zmm22[26],zero,zmm22[24],zero,zero,zero,zero,zmm22[27],zero,zmm22[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero,zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm22, %zmm22 @@ -8982,10 +9464,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm29, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm27[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm21[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm27[0,1,2,3],zmm3[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = 
zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 @@ -8993,44 +9475,44 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] -; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm12, %zmm0 +; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm13, %zmm0 ; AVX512BW-FAST-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k3} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm11[28],zero,ymm11[30,31,30,31],zero,ymm11[29],zero,ymm11[31,28,29] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero,zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm28[0],xmm30[0],xmm28[1],xmm30[1],xmm28[2],xmm30[2],xmm28[3],xmm30[3],xmm28[4],xmm30[4],xmm28[5],xmm30[5],xmm28[6],xmm30[6],xmm28[7],xmm30[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm27 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm27 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] ; AVX512BW-FAST-NEXT: vporq %ymm0, %ymm27, %ymm27 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm31 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm1[0],xmm31[1],xmm1[1],xmm31[2],xmm1[2],xmm31[3],xmm1[3],xmm31[4],xmm1[4],xmm31[5],xmm1[5],xmm31[6],xmm1[6],xmm31[7],xmm1[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm27 
{%k2} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm27 {%k2} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm15[27],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm24[27],zero,zero,zero,zero,ymm24[30],zero,ymm24[28],zero,zero,zero,zero,ymm24[31],zero,ymm24[29] -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm24[27],zero,zero,zero,zero,ymm24[30],zero,ymm24[28],zero,zero,zero,zero,ymm24[31],zero,ymm24[29] +; AVX512BW-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm2 ; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %xmm4 ; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] @@ -9069,7 +9551,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm28, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm12, %zmm4 +; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm13, %zmm4 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 @@ -9078,47 +9560,47 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm24 {%k2} ; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm16, %xmm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3],xmm14[4],xmm16[4],xmm14[5],xmm16[5],xmm14[6],xmm16[6],xmm14[7],xmm16[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 ; AVX512BW-FAST-NEXT: vpshufb 
%xmm2, %xmm19, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm17, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm18, %xmm3 ; AVX512BW-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm18[0],xmm19[0],xmm18[1],xmm19[1],xmm18[2],xmm19[2],xmm18[3],xmm19[3],xmm18[4],xmm19[4],xmm18[5],xmm19[5],xmm18[6],xmm19[6],xmm18[7],xmm19[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm2[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm0 -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm10, %xmm1 -; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm25[0],xmm10[1],xmm25[1],xmm10[2],xmm25[2],xmm10[3],xmm25[3],xmm10[4],xmm25[4],xmm10[5],xmm25[5],xmm10[6],xmm25[6],xmm10[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 32-byte Folded Reload +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm10, %xmm2 +; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm25[0],xmm10[1],xmm25[1],xmm10[2],xmm25[2],xmm10[3],xmm25[3],xmm10[4],xmm25[4],xmm10[5],xmm25[5],xmm10[6],xmm25[6],xmm10[7],xmm25[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512BW-FAST-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2} ; AVX512BW-FAST-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; 
AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm0[19],zero,zmm0[21,20,21,22],zero,zmm0[20],zero,zmm0[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm0[55],zero,zero,zero,zero,zmm0[58],zero,zmm0[56],zero,zero,zero,zero,zmm0[59],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[21],zero,zmm3[19],zero,zero,zero,zero,zmm3[22],zero,zmm3[20],zero,zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero,zmm3[57] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm3, %zmm3 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 32-byte Folded Reload @@ -9127,30 +9609,30 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm21, %zmm4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm0[18],zero,zmm0[20,21,20,21],zero,zmm0[19],zero,zmm0[19,20,21,22],zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm0[55],zero,zmm0[55,56,57,58],zero,zmm0[56],zero,zmm0[62,63] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm4 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = 
zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm4, %zmm0 +; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm4, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm4, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm4, %zmm2 ; AVX512BW-FAST-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} ; AVX512BW-FAST-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) ; AVX512BW-FAST-NEXT: addq $200, %rsp ; AVX512BW-FAST-NEXT: vzeroupper @@ -9180,10 +9662,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} -; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} -; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index cfb971dd16217..f8a3ab852b092 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -1177,16 +1177,17 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $216, %rsp +; SSE-NEXT: subq $232, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rcx), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r8), %xmm13 ; SSE-NEXT: movdqa (%r9), %xmm12 ; SSE-NEXT: movdqa (%r10), %xmm14 @@ -1197,66 +1198,64 @@ 
define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: 
pand %xmm8, %xmm6 ; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[2,1,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -1265,139 +1264,142 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[8],mem[8],xmm11[9],mem[9],xmm11[10],mem[10],xmm11[11],mem[11],xmm11[12],mem[12],xmm11[13],mem[13],xmm11[14],mem[14],xmm11[15],mem[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa 16(%r10), %xmm11 +; SSE-NEXT: movdqa 16(%r10), %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa 16(%rax), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa 16(%r8), %xmm13 -; SSE-NEXT: movdqa 16(%r9), %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 16(%r8), %xmm12 +; SSE-NEXT: movdqa 16(%r9), %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa 16(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa 16(%rdx), %xmm13 +; SSE-NEXT: movdqa 16(%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 ; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm8, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm15, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = 
xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm11[8],xmm15[9],xmm11[9],xmm15[10],xmm11[10],xmm15[11],xmm11[11],xmm15[12],xmm11[12],xmm15[13],xmm11[13],xmm15[14],xmm11[14],xmm15[15],xmm11[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: 
pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -1406,42 +1408,42 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -1449,133 +1451,133 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufhw 
{{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,6,5] -; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[3,3,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,4,6,5] -; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm6, 224(%rax) -; SSE-NEXT: movdqa %xmm5, 240(%rax) +; SSE-NEXT: movdqa %xmm10, 240(%rax) ; SSE-NEXT: movdqa %xmm4, 160(%rax) -; SSE-NEXT: movdqa %xmm9, 176(%rax) +; SSE-NEXT: movdqa %xmm2, 176(%rax) ; SSE-NEXT: movdqa %xmm0, 96(%rax) ; SSE-NEXT: movdqa %xmm3, 112(%rax) ; SSE-NEXT: movdqa %xmm1, 32(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) @@ -1593,7 +1595,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $216, %rsp +; SSE-NEXT: addq $232, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf32: @@ -1601,10 +1603,9 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: subq $72, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1612,8 +1613,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 ; 
AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 @@ -1622,237 +1623,236 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm14 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm2, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4],ymm5[5],ymm13[6],ymm5[7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, 
%ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: 
vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = 
xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} 
xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -1889,151 +1889,151 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm8, %ymm14 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm8, %ymm15 ; AVX2-SLOW-NEXT: vmovaps 16(%r10), %xmm8 ; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7,8],ymm14[9],ymm10[10,11,12],ymm14[13],ymm10[14,15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2],ymm0[3],ymm14[4],ymm0[5],ymm14[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7,8],ymm15[9],ymm9[10,11,12],ymm15[13],ymm9[14,15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = 
xmm1[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5,6],ymm1[7],ymm12[8,9,10],ymm1[11],ymm12[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm15[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm13, %ymm13 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7,8],ymm13[9],ymm12[10,11,12],ymm13[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm1[1],ymm12[2],ymm1[3],ymm12[4],ymm1[5],ymm12[6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7,8],ymm13[9],ymm1[10,11,12],ymm13[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = 
xmm0[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7,8],ymm9[9],ymm7[10,11,12],ymm9[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX2-SLOW-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm9 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3],ymm12[4,5,6],ymm7[7],ymm12[8,9,10],ymm7[11],ymm12[12,13,14],ymm7[15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm13, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = 
ymm13[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3],ymm13[4,5,6],ymm7[7],ymm13[8,9,10],ymm7[11],ymm13[12,13,14],ymm7[15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm14[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,6,5,7,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] @@ -2041,13 +2041,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] @@ -2098,134 +2098,134 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: subq $72, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm7 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm6 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: 
vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3],ymm10[4,5,6],ymm3[7],ymm10[8,9,10],ymm3[11],ymm10[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm0 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7],ymm6[8,9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm10[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, 
%ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm4 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm5 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm5 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm11, %xmm5 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] @@ -2251,134 +2251,134 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, 
%ymm1, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm8, %ymm8 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3],ymm10[4,5,6],ymm3[7],ymm10[8,9,10],ymm3[11],ymm10[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7],ymm6[8,9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm8 = 
xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm10[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm1[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] @@ -2408,137 +2408,137 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rax), %xmm12 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: 
vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7],ymm10[8,9,10],ymm2[11],ymm10[12,13,14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7],ymm7[8,9,10],ymm1[11],ymm7[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm0[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = 
xmm0[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 @@ -2556,14 +2556,14 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] @@ -2581,8 +2581,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] @@ -2609,12 +2609,12 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -2633,8 +2633,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 
{{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 @@ -2647,19 +2647,19 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm28 @@ -2673,63 +2673,63 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512F-ONLY-FAST-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3],ymm15[4,5,6],ymm0[7],ymm15[8,9,10],ymm0[11],ymm15[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm14[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4],ymm8[5],ymm15[6,7,8],ymm8[9],ymm15[10,11,12],ymm8[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 ; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 @@ -2742,13 +2742,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 @@ -2756,11 +2756,11 @@ define void 
@store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm6 @@ -2771,7 +2771,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm25, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm20, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} @@ -3017,21 +3017,21 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> ; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm10 ; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; 
AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm11 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] @@ -3055,69 +3055,70 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: movw $-21846, %r11w # imm = 0xAAAA ; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm29 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm2 ; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7],ymm8[8,9,10],ymm1[11],ymm8[12,13,14],ymm1[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5,6],ymm1[7],ymm14[8,9,10],ymm1[11],ymm14[12,13,14],ymm1[15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5,6],ymm0[7],ymm8[8,9,10],ymm0[11],ymm8[12,13,14],ymm0[15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6],ymm0[7],ymm14[8,9,10],ymm0[11],ymm14[12,13,14],ymm0[15] ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7,8],ymm7[9],ymm6[10,11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm15 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, 
%ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 @@ -3127,26 +3128,26 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm4 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm4 ; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] ; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -3564,25 +3565,25 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa 16(%r9), %xmm9 +; SSE-NEXT: movdqa 16(%r9), %xmm6 ; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = 
xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm14 ; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: movdqa 16(%rdx), %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm6 +; SSE-NEXT: movdqa 16(%rcx), %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm4, %xmm15 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm8 +; SSE-NEXT: movdqa 16(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm7 @@ -3657,24 +3658,24 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm10 ; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] @@ -3737,34 +3738,34 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r10), %xmm0 -; SSE-NEXT: movdqa 32(%rax), %xmm9 +; SSE-NEXT: movdqa 32(%rax), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa 32(%r8), %xmm3 -; SSE-NEXT: movdqa 32(%r9), %xmm8 +; SSE-NEXT: movdqa 32(%r9), %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm14 ; SSE-NEXT: por %xmm4, %xmm14 ; SSE-NEXT: movdqa 32(%rdx), %xmm4 -; SSE-NEXT: movdqa 32(%rcx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rcx), %xmm8 ; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm5, %xmm15 ; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm7 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 ; SSE-NEXT: movdqa %xmm5, %xmm2 
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm6 @@ -3833,27 +3834,27 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm9, %xmm6 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] @@ -3925,28 +3926,28 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa 48(%r8), %xmm7 -; SSE-NEXT: movdqa 48(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa 48(%r8), %xmm8 +; SSE-NEXT: movdqa 48(%r9), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa 48(%rdx), %xmm8 -; SSE-NEXT: movdqa 48(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa 48(%rdx), %xmm7 +; SSE-NEXT: movdqa 48(%rcx), %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa 48(%rsi), %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] @@ -3960,11 +3961,11 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm10 ; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm14, %xmm15 @@ -3979,11 +3980,11 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} 
xmm10 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; SSE-NEXT: pand %xmm13, %xmm10 ; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm14, %xmm2 @@ -3998,18 +3999,18 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] @@ -4019,85 +4020,85 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] ; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pand %xmm13, %xmm8 ; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: por %xmm7, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: por %xmm8, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] 
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm1, 496(%rax) -; SSE-NEXT: movdqa %xmm5, 480(%rax) +; SSE-NEXT: movdqa %xmm3, 480(%rax) ; SSE-NEXT: movdqa %xmm0, 464(%rax) -; SSE-NEXT: movdqa %xmm3, 448(%rax) +; SSE-NEXT: movdqa %xmm4, 448(%rax) ; SSE-NEXT: movdqa %xmm6, 432(%rax) ; SSE-NEXT: movdqa %xmm10, 416(%rax) ; SSE-NEXT: movdqa %xmm15, 400(%rax) @@ -4168,357 +4169,357 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: 
vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = 
xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm14 -; 
AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = 
xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm4, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps 
%ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm8 -; 
AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = 
xmm2[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, 
%ymm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm15 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm4, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0],ymm13[1],ymm2[2],ymm13[3],ymm2[4],ymm13[5],ymm2[6],ymm13[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4],ymm15[5],ymm3[6],ymm15[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] @@ -4529,131 +4530,131 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, 
%xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4],ymm2[5],ymm6[6],ymm2[7] -; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = 
xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm5, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2],ymm2[3],ymm7[4],ymm2[5],ymm7[6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4759,41 +4760,41 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} 
ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7,8],ymm12[9],ymm14[10,11,12],ymm12[13],ymm14[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] @@ -4810,12 +4811,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] @@ -4832,64 +4833,64 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = 
xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5] @@ -4899,148 +4900,148 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, 
%xmm8, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7],ymm2[8,9,10],ymm5[11],ymm2[12,13,14],ymm5[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = 
xmm0[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm12 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm9 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4],ymm9[5],ymm1[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7,8],ymm14[9],ymm2[10,11,12],ymm14[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm2[0],ymm11[1],ymm2[2],ymm11[3],ymm2[4],ymm11[5],ymm2[6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7],ymm8[8,9,10],ymm1[11],ymm8[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7,8],ymm0[9],ymm8[10,11,12],ymm0[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: 
vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] @@ -5066,12 +5067,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero -; AVX2-SLOW-NEXT: 
vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7,8],ymm10[9],ymm6[10,11,12],ymm10[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3,4],ymm9[5],ymm6[6,7,8],ymm9[9],ymm6[10,11,12],ymm9[13],ymm6[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] @@ -5096,10 +5097,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 192(%rax) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5143,281 +5144,281 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm3[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm13 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vmovdqa 48(%r10), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm13 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; AVX2-FAST-NEXT: vmovdqa 48(%r10), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm3 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm2 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7,8],ymm12[9],ymm1[10,11,12],ymm12[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm13, 
%ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7,8],ymm15[9],ymm11[10,11,12],ymm15[13],ymm11[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX2-FAST-NEXT: vpunpckhbw 
{{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4],ymm4[5],ymm7[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb 
%ymm8, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7,8],ymm6[9],ymm1[10,11,12],ymm6[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm0 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2],ymm3[3],ymm11[4],ymm3[5],ymm11[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm2 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm0 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, 
%ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm10 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7,8],ymm9[9],ymm11[10,11,12],ymm9[13],ymm11[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7,8],ymm5[9],ymm1[10,11,12],ymm5[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 
= xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm9 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4],ymm6[5],ymm9[6],ymm6[7] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -5467,281 +5468,281 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7,8],ymm12[9],ymm1[10,11,12],ymm12[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm4, %xmm5 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, 
%ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7,8],ymm15[9],ymm11[10,11,12],ymm15[13],ymm11[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-FAST-PERLANE-NEXT: 
vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4],ymm4[5],ymm7[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = 
[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7,8],ymm6[9],ymm1[10,11,12],ymm6[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 
-; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2],ymm3[3],ymm11[4],ymm3[5],ymm11[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm13, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, 
%xmm12, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7,8],ymm9[9],ymm11[10,11,12],ymm9[13],ymm11[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm10 +; AVX2-FAST-PERLANE-NEXT: 
vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = 
[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7,8],ymm5[9],ymm1[10,11,12],ymm5[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4],ymm6[5],ymm9[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -5774,483 +5775,479 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512F-SLOW-LABEL: store_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX512F-SLOW-NEXT: subq $648, %rsp # imm = 0x288 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, 
%ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%r10), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm13 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; 
AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm16 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 
32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm14, %ymm28 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm2, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm14 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm14, %ymm31 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm24 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm22 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm20 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm18 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm30 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm25 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm30 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm23 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm8, %ymm7, %ymm21 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm20 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm12[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm13, %ymm19 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm13, %ymm17 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm10, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm31 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw 
{{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm27 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm26 -; AVX512F-SLOW-NEXT: vmovdqa 16(%r10), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 16(%r10), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rax), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm16 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm24 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm27 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] ; AVX512F-SLOW-NEXT: vmovdqa 48(%rsi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,3,3,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm6 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm5 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpshufd $96, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpandnq %zmm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm6 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpandnq %zmm4, %zmm8, %zmm4 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm5, %zmm5 ; AVX512F-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vpord %zmm4, %zmm7, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpord %zmm4, %zmm5, %zmm6 {%k1} ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm16 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm16 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, 
%zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpord %zmm2, %zmm4, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm14 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm9 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd $96, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm2, %zmm14 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm25[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm17 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpord %zmm2, %zmm3, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm25 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm2, %ymm23 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 
+; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm9, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw (%rsp), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm11, %ymm10, %ymm20 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm10, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm12, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 
+; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm23[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpandnq %zmm5, %zmm8, %zmm5 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpord %zmm5, %zmm7, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm5, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm21 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm5, %ymm22 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm10, %ymm14 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm10, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm10, %ymm17 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm10, %ymm20 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rsi), %xmm10 ; 
AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[2,3,2,3] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm7[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm30, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm13 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm3, %zmm5 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm13, %zmm13 +; AVX512F-SLOW-NEXT: vpandnq %zmm13, %zmm8, %zmm13 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm23, %zmm23 +; AVX512F-SLOW-NEXT: vpord %zmm13, %zmm23, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} 
xmm13 = xmm12[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm12[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm24, %ymm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm10[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm10[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 ; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm5, %zmm13 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpandnq %zmm7, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm31[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm28, %zmm28 -; AVX512F-SLOW-NEXT: vpord %zmm7, %zmm28, %zmm13 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm15[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm15[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm28[0],zero,zero,zero,xmm28[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm30, %ymm28 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm29, %ymm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm29 = xmm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm12[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = 
xmm12[1,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm26 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm15, %zmm5, %zmm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm7[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm22 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm19 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 -; AVX512F-SLOW-NEXT: vpandnq %zmm22, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm18, %zmm18 -; AVX512F-SLOW-NEXT: vpord %zmm19, %zmm18, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm18, %ymm18 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm31, %ymm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm19 = ymm25[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm5, %zmm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm2, %zmm18 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm28, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] 
+; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 +; AVX512F-SLOW-NEXT: vpandnq %zmm25, %zmm8, %zmm25 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vpord %zmm25, %zmm16, %zmm13 {%k1} +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm29, %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm16 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm21[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 +; AVX512F-SLOW-NEXT: vpandnq %zmm10, %zmm8, %zmm10 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpord %zmm10, %zmm7, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = 
ymm11[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm0, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX512F-SLOW-NEXT: addq $648, %rsp # imm = 0x288 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -6259,294 +6256,288 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: subq $392, %rsp # imm = 0x188 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm7, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: 
vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm14 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm4[8],xmm13[9],xmm4[9],xmm13[10],xmm4[10],xmm13[11],xmm4[11],xmm13[12],xmm4[12],xmm13[13],xmm4[13],xmm13[14],xmm4[14],xmm13[15],xmm4[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 +; 
AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm22 ; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm23 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm24 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm23 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm26 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm20 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm28 ; AVX512F-FAST-NEXT: vmovdqa 16(%r9), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm16 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm29 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm30 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 +; 
AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm31 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm17 ; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm6 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} 
zmm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpandnq %zmm29, %zmm0, %zmm29 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm17, %zmm17 +; AVX512F-FAST-NEXT: vpandnq %zmm19, %zmm0, %zmm19 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm21, %zmm21 ; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vpord %zmm29, %zmm17, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vpord %zmm19, %zmm21, %zmm4 {%k1} ; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm17 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm19 ; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm29, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %ymm21, %ymm1 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm6 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm3 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, 
%xmm3 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm4 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm28, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm15 +; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm1 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm4 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm15, %ymm4 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm4, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm14 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm15 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm5 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm5, %ymm19, %ymm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm14 -; 
AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandnq (%rsp), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm21, %zmm6, %zmm1 -; AVX512F-FAST-NEXT: vpandnq %zmm22, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm23, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm6, %zmm3 -; AVX512F-FAST-NEXT: vpandnq %zmm25, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm26, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm27, %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpandnq %zmm20, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm16, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm30, %zmm6, %zmm5 -; AVX512F-FAST-NEXT: vpandnq %zmm31, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm13, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm17, %zmm6, %zmm9 -; AVX512F-FAST-NEXT: vpandnq %zmm29, %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm28, %zmm0 -; AVX512F-FAST-NEXT: vpord %zmm6, %zmm0, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm14 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm21, %ymm14 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq (%rsp), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: 
vpord %zmm7, %zmm8, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm20, %zmm3, %zmm11 +; AVX512F-FAST-NEXT: vpandnq %zmm22, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm23, %zmm8 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm3, %zmm14 +; AVX512F-FAST-NEXT: vpandnq %zmm25, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm26, %zmm8 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm27, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpandnq %zmm28, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm29, %zmm8 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm30, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vpandnq %zmm31, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm16, %zmm8 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm19, %zmm3, %zmm5 +; AVX512F-FAST-NEXT: vpandnq %zmm13, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm21, %zmm0 +; AVX512F-FAST-NEXT: vpord %zmm3, %zmm0, %zmm5 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512F-FAST-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -6557,226 +6548,228 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r10), %xmm21 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r10), %xmm22 ; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r10), %xmm19 ; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rax), %xmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rax), %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rax), %xmm23 ; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rax), %xmm20 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm23 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r8), %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 
48(%r8), %xmm26 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r9), %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm21 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r8), %xmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r8), %xmm24 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm7, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] ; AVX512BW-SLOW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm29 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm30 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rcx), %xmm31 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa 48(%rcx), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] ; AVX512BW-SLOW-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm5 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm17, %zmm17, %zmm17 -; AVX512BW-SLOW-NEXT: vpermw %zmm17, %zmm7, %zmm17 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm17 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpermw %zmm14, %zmm12, %zmm3 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm15 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 +; AVX512BW-SLOW-NEXT: vpermw %zmm14, %zmm10, %zmm14 +; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm9, %zmm14 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm15[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = 
xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm18, %ymm18 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm27, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm31[0],xmm1[1],xmm31[1],xmm1[2],xmm31[2],xmm1[3],xmm31[3],xmm1[4],xmm31[4],xmm1[5],xmm31[5],xmm1[6],xmm31[6],xmm1[7],xmm31[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm18 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm30 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm29 = xmm15[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm27, %ymm27 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm29, %ymm15 +; AVX512BW-SLOW-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm27, %zmm27 +; AVX512BW-SLOW-NEXT: vpermw %zmm27, %zmm12, %zmm15 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm29 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm31 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15] ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm25 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm21 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm20, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm7, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm19 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm25 = xmm20[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm20[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm26[0],zero,zero,zero,xmm26[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 -; 
AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm10, %zmm19 +; AVX512BW-SLOW-NEXT: vpermw %zmm21, %zmm9, %zmm19 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm20[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm20[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm20[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm26, %ymm20 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm31[8],xmm1[9],xmm31[9],xmm1[10],xmm31[10],xmm1[11],xmm31[11],xmm1[12],xmm31[12],xmm1[13],xmm31[13],xmm1[14],xmm31[14],xmm1[15],xmm31[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm14, %zmm20 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm24, %ymm20 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm20 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm20 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm25, %zmm25, %zmm25 -; AVX512BW-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm25 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm8, %zmm25 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm30[0],xmm0[1],xmm30[1],xmm0[2],xmm30[2],xmm0[3],xmm30[3],xmm0[4],xmm30[4],xmm0[5],xmm30[5],xmm0[6],xmm30[6],xmm0[7],xmm30[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm26[0],zero,zero,zero,xmm26[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm1[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm28[0],zero,zero,zero,xmm28[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm28, 
%ymm26, %ymm26 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm21 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm21 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm29[0],xmm31[1],xmm29[1],xmm31[2],xmm29[2],xmm31[3],xmm29[3],xmm31[4],xmm29[4],xmm31[5],xmm29[5],xmm31[6],xmm29[6],xmm31[7],xmm29[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm28, %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm31 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm1, %zmm26 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm14, %zmm26 {%k2} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm27[0],xmm2[1],xmm27[1],xmm2[2],xmm27[2],xmm2[3],xmm27[3],xmm2[4],xmm27[4],xmm2[5],xmm27[5],xmm2[6],xmm27[6],xmm2[7],xmm27[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm24 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm29 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm23 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm30 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm25 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm21 -; AVX512BW-SLOW-NEXT: vpermw %zmm21, %zmm7, %zmm21 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm8, %zmm21 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm30[8],xmm0[9],xmm30[9],xmm0[10],xmm30[10],xmm0[11],xmm30[11],xmm0[12],xmm30[12],xmm0[13],xmm30[13],xmm0[14],xmm30[14],xmm0[15],xmm30[15] +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm22 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm22 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm29[8],xmm31[9],xmm29[9],xmm31[10],xmm29[10],xmm31[11],xmm29[11],xmm31[12],xmm29[12],xmm31[13],xmm29[13],xmm31[14],xmm29[14],xmm31[15],xmm29[15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm22 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm22, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm23, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm27[8],xmm2[9],xmm27[9],xmm2[10],xmm27[10],xmm2[11],xmm27[11],xmm2[12],xmm27[12],xmm2[13],xmm27[13],xmm2[14],xmm27[14],xmm2[15],xmm27[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm22 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm23 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm16[0],xmm13[0],xmm16[1],xmm13[1],xmm16[2],xmm13[2],xmm16[3],xmm13[3],xmm16[4],xmm13[4],xmm16[5],xmm13[5],xmm16[6],xmm13[6],xmm16[7],xmm13[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm7, %zmm24 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm24 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm26 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm26 {%k1} +; 
AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm27, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm30 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm25[0],xmm2[1],xmm25[1],xmm2[2],xmm25[2],xmm2[3],xmm25[3],xmm2[4],xmm25[4],xmm2[5],xmm25[5],xmm2[6],xmm25[6],xmm2[7],xmm25[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm27 {%k2} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm7, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512BW-SLOW-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm3 -; AVX512BW-SLOW-NEXT: vpermw %zmm3, %zmm7, %zmm3 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm4 -; AVX512BW-SLOW-NEXT: vpermw %zmm4, %zmm8, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; 
AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm27 {%k2} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm13[8],xmm16[9],xmm13[9],xmm16[10],xmm13[10],xmm16[11],xmm13[11],xmm16[12],xmm13[12],xmm16[13],xmm13[13],xmm16[14],xmm13[14],xmm16[15],xmm13[15] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm13 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm10, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm13, %zmm9, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX512BW-SLOW-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; AVX512BW-SLOW-NEXT: vpermw %zmm5, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm7 +; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm9, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm30[8],xmm23[8],xmm30[9],xmm23[9],xmm30[10],xmm23[10],xmm30[11],xmm23[11],xmm30[12],xmm23[12],xmm30[13],xmm23[13],xmm30[14],xmm23[14],xmm30[15],xmm23[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpermw %zmm4, %zmm14, %zmm0 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm7, %ymm7 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 +; 
AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm25[8],xmm2[9],xmm25[9],xmm2[10],xmm25[10],xmm2[11],xmm25[11],xmm2[12],xmm25[12],xmm2[13],xmm25[13],xmm2[14],xmm25[14],xmm2[15],xmm25[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vpermw %zmm2, %zmm12, %zmm1 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload +; AVX512BW-SLOW-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vpermw %zmm6, %zmm14, %zmm4 {%k2} +; AVX512BW-SLOW-NEXT: vpermw %zmm6, %zmm12, %zmm2 {%k2} ; AVX512BW-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm14, %zmm15 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm19, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm25, %zmm26 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm21, %zmm22 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm24, %zmm27 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm21, %zmm24 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm22, %zmm23 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm26, %zmm27 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 
%zmm26, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -6785,7 +6778,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm14 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%r10), %xmm18 ; AVX512BW-FAST-NEXT: vmovdqa64 48(%r10), %xmm17 ; AVX512BW-FAST-NEXT: vmovdqa (%rax), %xmm1 @@ -6820,16 +6813,16 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512BW-FAST-NEXT: vmovdqa64 48(%rsi), %xmm25 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm28 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm14 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm27 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm28 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm13 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm26 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm27 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm28, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm8, %zmm8 ; AVX512BW-FAST-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm8 {%k2} @@ -6845,22 +6838,22 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm16, %xmm27 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm28, %ymm27 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = 
xmm28[0],xmm25[0],xmm28[1],xmm25[1],xmm28[2],xmm25[2],xmm28[3],xmm25[3],xmm28[4],xmm25[4],xmm28[5],xmm25[5],xmm28[6],xmm25[6],xmm28[7],xmm25[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm16, %xmm26 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm27, %ymm26 ; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdx), %xmm30 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm27, %ymm27 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm27, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm27 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm16, %ymm16 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm16 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm26, %ymm26 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm26, %zmm26 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm26, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm26, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm26 ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm16 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm29 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm27 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm24 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] @@ -6872,15 +6865,15 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm20, %zmm20 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm20 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm28 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm29 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm23[8],xmm30[9],xmm23[9],xmm30[10],xmm23[10],xmm30[11],xmm23[11],xmm30[12],xmm23[12],xmm30[13],xmm23[13],xmm30[14],xmm23[14],xmm30[15],xmm23[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm17, %zmm17 ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm17, %zmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm28[8],xmm25[8],xmm28[9],xmm25[9],xmm28[10],xmm25[10],xmm28[11],xmm25[11],xmm28[12],xmm25[12],xmm28[13],xmm25[13],xmm28[14],xmm25[14],xmm28[15],xmm25[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm23 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm23, %ymm23 
-; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm17, %xmm25 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm23, %ymm23 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm17, %xmm25 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm17 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm17, %zmm17 @@ -6890,96 +6883,96 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm22, %zmm22 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm22, %zmm22 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm20, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm21[0],xmm28[1],xmm21[1],xmm28[2],xmm21[2],xmm28[3],xmm21[3],xmm28[4],xmm21[4],xmm28[5],xmm21[5],xmm28[6],xmm21[6],xmm28[7],xmm21[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm20, %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm21[0],xmm29[1],xmm21[1],xmm29[2],xmm21[2],xmm29[3],xmm21[3],xmm29[4],xmm21[4],xmm29[5],xmm21[5],xmm29[6],xmm21[6],xmm29[7],xmm21[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm20, %xmm23 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm30 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm28 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm20, %ymm20 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm20, %ymm20 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm23, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm30[0],xmm24[0],xmm30[1],xmm24[1],xmm30[2],xmm24[2],xmm30[3],xmm24[3],xmm30[4],xmm24[4],xmm30[5],xmm24[5],xmm30[6],xmm24[6],xmm30[7],xmm24[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm23 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm23, %zmm23 ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm23, %zmm23 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm23, %zmm20 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa64 16(%r9), %xmm23 ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm22, %zmm20 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm25 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rcx), %xmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm29[8],xmm27[8],xmm29[9],xmm27[9],xmm29[10],xmm27[10],xmm29[11],xmm27[11],xmm29[12],xmm27[12],xmm29[13],xmm27[13],xmm29[14],xmm27[14],xmm29[15],xmm27[15] 
-; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm19 +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rcx), %xmm19 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm22 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm25, %ymm25 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm25, %zmm25 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm25, %zmm27 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm26, %ymm26 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm26, %zmm26 +; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm26, %zmm27 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm24[8],xmm30[9],xmm24[9],xmm30[10],xmm24[10],xmm30[11],xmm24[11],xmm30[12],xmm24[12],xmm30[13],xmm24[13],xmm30[14],xmm24[14],xmm30[15],xmm24[15] +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm26 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm18, %zmm24 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm21[8],xmm28[9],xmm21[9],xmm28[10],xmm21[10],xmm28[11],xmm21[11],xmm28[12],xmm21[12],xmm28[13],xmm21[13],xmm28[14],xmm21[14],xmm28[15],xmm21[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm29[8],xmm21[8],xmm29[9],xmm21[9],xmm29[10],xmm21[10],xmm29[11],xmm21[11],xmm29[12],xmm21[12],xmm29[13],xmm21[13],xmm29[14],xmm21[14],xmm29[15],xmm21[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm18, %xmm28 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm21, %ymm21 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm18, %xmm28 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm28, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm18 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm21, %zmm21 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm26[0],xmm23[0],xmm26[1],xmm23[1],xmm26[2],xmm23[2],xmm26[3],xmm23[3],xmm26[4],xmm23[4],xmm26[5],xmm23[5],xmm26[6],xmm23[6],xmm26[7],xmm23[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = 
xmm25[0],xmm23[0],xmm25[1],xmm23[1],xmm25[2],xmm23[2],xmm25[3],xmm23[3],xmm25[4],xmm23[4],xmm25[5],xmm23[5],xmm25[6],xmm23[6],xmm25[7],xmm23[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm21, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm19[0],xmm25[1],xmm19[1],xmm25[2],xmm19[2],xmm25[3],xmm19[3],xmm25[4],xmm19[4],xmm25[5],xmm19[5],xmm25[6],xmm19[6],xmm25[7],xmm19[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm21, %xmm27 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm21, %xmm27 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm28, %ymm27 ; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdx), %xmm28 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm21 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm21, %ymm21 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm27, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm28[0],xmm22[0],xmm28[1],xmm22[1],xmm28[2],xmm22[2],xmm28[3],xmm22[3],xmm28[4],xmm22[4],xmm28[5],xmm22[5],xmm28[6],xmm22[6],xmm28[7],xmm22[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm28[0],xmm19[0],xmm28[1],xmm19[1],xmm28[2],xmm19[2],xmm28[3],xmm19[3],xmm28[4],xmm19[4],xmm28[5],xmm19[5],xmm28[6],xmm19[6],xmm28[7],xmm19[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm27, %ymm27 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm27, %zmm27 ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm27, %zmm27 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm27, %zmm21 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm21 {%k3} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm26[8],xmm23[8],xmm26[9],xmm23[9],xmm26[10],xmm23[10],xmm26[11],xmm23[11],xmm26[12],xmm23[12],xmm26[13],xmm23[13],xmm26[14],xmm23[14],xmm26[15],xmm23[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm14, %zmm14 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm15 ; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm15 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm28[8],xmm22[8],xmm28[9],xmm22[9],xmm28[10],xmm22[10],xmm28[11],xmm22[11],xmm28[12],xmm22[12],xmm28[13],xmm22[13],xmm28[14],xmm22[14],xmm28[15],xmm22[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm25[8],xmm19[8],xmm25[9],xmm19[9],xmm25[10],xmm19[10],xmm25[11],xmm19[11],xmm25[12],xmm19[12],xmm25[13],xmm19[13],xmm25[14],xmm19[14],xmm25[15],xmm19[15] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm19, %xmm22 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm28[8],xmm19[8],xmm28[9],xmm19[9],xmm28[10],xmm19[10],xmm28[11],xmm19[11],xmm28[12],xmm19[12],xmm28[13],xmm19[13],xmm28[14],xmm19[14],xmm28[15],xmm19[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm19, %xmm22 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm23, %ymm22 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm19 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm19, %ymm19 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm19 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm22, %zmm19 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm19 {%k2} +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm19 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm19 {%k3} ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -6996,8 +6989,8 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm4 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll index a724babe469c5..3c5e3adf038fa 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -214,17 +214,17 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: 
vpandn %ymm1, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -234,8 +234,8 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] @@ -301,17 +301,17 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -321,8 +321,8 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = 
ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll index 4c4a2fc8dcabe..fe2c41f57cfab 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -136,63 +136,61 @@ define float @test_v3f32(<3 x float> %a0) { define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: cmpunordss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: maxss %xmm0, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: maxss %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: andnps %xmm3, %xmm1 -; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: andnps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: maxss %xmm1, %xmm3 -; SSE2-NEXT: cmpunordss %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: maxss %xmm0, %xmm3 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: andnps %xmm3, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm1 -; SSE2-NEXT: orps %xmm4, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: maxss %xmm1, %xmm2 -; SSE2-NEXT: cmpunordss %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: maxss %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: andnps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm0, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm0, %xmm2 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: cmpunordss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: maxss %xmm0, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: maxss %xmm1, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE41-NEXT: andnps %xmm3, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: andnps %xmm3, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 ; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: maxss %xmm1, %xmm3 -; SSE41-NEXT: cmpunordss %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: maxss %xmm1, %xmm2 -; SSE41-NEXT: cmpunordss %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 ; SSE41-NEXT: andnps %xmm2, %xmm3 
-; SSE41-NEXT: andps %xmm0, %xmm1 -; SSE41-NEXT: orps %xmm3, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll index 301629f033dbc..ec41657d2f248 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll @@ -729,15 +729,15 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-LABEL: test_v16f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: movaps %xmm1, %xmm5 +; SSE41-NEXT: movaps %xmm1, %xmm6 ; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm6 -; SSE41-NEXT: maxps %xmm5, %xmm6 +; SSE41-NEXT: movaps %xmm3, %xmm5 +; SSE41-NEXT: maxps %xmm6, %xmm5 ; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: cmpunordps %xmm3, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5 ; SSE41-NEXT: movaps %xmm4, %xmm3 ; SSE41-NEXT: movaps %xmm4, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 @@ -749,13 +749,13 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movaps %xmm6, %xmm1 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movaps %xmm5, %xmm1 ; SSE41-NEXT: maxps %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm6, %xmm0 -; SSE41-NEXT: cmpunordps %xmm6, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movaps %xmm5, %xmm0 +; SSE41-NEXT: cmpunordps %xmm5, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE41-NEXT: movd %xmm1, %eax ; SSE41-NEXT: testl %eax, %eax @@ -1279,15 +1279,15 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE41-LABEL: test_v8f64: ; SSE41: # %bb.0: ; SSE41-NEXT: movapd %xmm0, %xmm4 -; SSE41-NEXT: movapd %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm1, %xmm6 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: maxpd %xmm5, %xmm6 +; SSE41-NEXT: movapd %xmm3, %xmm5 +; SSE41-NEXT: maxpd %xmm6, %xmm5 ; SSE41-NEXT: movapd %xmm3, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 ; SSE41-NEXT: movapd %xmm4, %xmm3 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 @@ -1299,13 +1299,13 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: movapd %xmm5, %xmm1 ; SSE41-NEXT: maxpd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm6, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm5, %xmm0 +; SSE41-NEXT: cmpunordpd %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, 
%xmm5, %xmm1 ; SSE41-NEXT: movapd %xmm1, %xmm2 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: movq %xmm1, %rax @@ -1579,15 +1579,15 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movapd %xmm1, %xmm8 ; SSE41-NEXT: movapd %xmm0, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm9 +; SSE41-NEXT: movapd %xmm3, %xmm10 ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm10 -; SSE41-NEXT: maxpd %xmm9, %xmm10 +; SSE41-NEXT: movapd %xmm7, %xmm9 +; SSE41-NEXT: maxpd %xmm10, %xmm9 ; SSE41-NEXT: movapd %xmm7, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 ; SSE41-NEXT: movapd %xmm8, %xmm7 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 @@ -1599,13 +1599,13 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 ; SSE41-NEXT: movapd %xmm3, %xmm5 ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 -; SSE41-NEXT: movapd %xmm10, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 +; SSE41-NEXT: movapd %xmm9, %xmm3 ; SSE41-NEXT: maxpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm10, %xmm0 -; SSE41-NEXT: cmpunordpd %xmm10, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 +; SSE41-NEXT: movapd %xmm9, %xmm0 +; SSE41-NEXT: cmpunordpd %xmm9, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 ; SSE41-NEXT: movapd %xmm2, %xmm5 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll index e622899c8de7a..5ae9e552d0dcd 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -69,63 +69,61 @@ define float @test_v2f32(<2 x float> %a0) { define float @test_v4f32(<4 x float> %a0) { ; SSE2-LABEL: test_v4f32: ; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: movaps %xmm0, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: cmpunordss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: minss %xmm0, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: minss %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: andnps %xmm3, %xmm1 -; SSE2-NEXT: orps %xmm4, %xmm1 +; SSE2-NEXT: andnps %xmm3, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: minss %xmm1, %xmm3 -; SSE2-NEXT: cmpunordss %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm4 +; SSE2-NEXT: minss %xmm0, %xmm3 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: andnps %xmm3, %xmm4 -; SSE2-NEXT: andps %xmm2, %xmm1 -; SSE2-NEXT: orps %xmm4, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: minss %xmm1, %xmm2 -; SSE2-NEXT: cmpunordss %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm3 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: orps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: minss %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, %xmm3 ; 
SSE2-NEXT: andnps %xmm2, %xmm3 -; SSE2-NEXT: andps %xmm0, %xmm1 -; SSE2-NEXT: orps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: orps %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm0, %xmm2 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: cmpunordss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: minss %xmm0, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: minss %xmm1, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE41-NEXT: andnps %xmm3, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: andnps %xmm3, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 ; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: minss %xmm1, %xmm3 -; SSE41-NEXT: cmpunordss %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: minss %xmm0, %xmm3 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm4 ; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: minss %xmm1, %xmm2 -; SSE41-NEXT: cmpunordss %xmm1, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm0 +; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 ; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm0, %xmm1 -; SSE41-NEXT: orps %xmm3, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: andps %xmm1, %xmm0 +; SSE41-NEXT: orps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index 0ec81c8077cd4..7f828fc293caa 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -843,7 +843,7 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k4 ; AVX512BW-NEXT: kmovw (%rdi), %k2 ; AVX512BW-NEXT: kandw %k4, %k2, %k3 -; AVX512BW-NEXT: kmovq %k4, %k6 +; AVX512BW-NEXT: kmovq %k4, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 @@ -855,16 +855,15 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: movw $-9, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovq %k3, %k4 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovd %eax, %k5 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax @@ -917,22 +916,22 @@ define void @mask_replication_factor3_vf32(ptr 
%in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 +; AVX512BW-NEXT: kshiftrd $4, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k6 +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $5, %k0, %k2 @@ -942,141 +941,142 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k4 ; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k6, %k2 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovq %k7, %k2 +; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $28, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw 
%k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $29, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $29, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $30, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $30, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $31, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $31, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), 
%zmm1 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k5 +; AVX512BW-NEXT: kandw %k2, %k1, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $22, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $22, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $23, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $23, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $25, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: 
kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $25, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 @@ -1086,63 +1086,63 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $18, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: 
kshiftrd $19, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $20, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -1152,128 +1152,128 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $10, %k0, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $10, %k0, %k4 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: korw %k2, %k4, %k2 ; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $12, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $12, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; 
AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $13, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $13, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $15, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 
-; AVX512BW-NEXT: kshiftrd $6, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $7, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $7, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $8, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $8, %k0, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $9, %k0, %k0 ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; 
AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 @@ -1475,9 +1475,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovq %k3, %k5 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: movw $-65, %ax @@ -1490,8 +1489,9 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-129, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovq %k3, %k5 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF @@ -1549,47 +1549,47 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k0, %k2 -; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 +; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, 
%k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $61, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -1598,12 +1598,12 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kandw %k6, %k1, %k1 @@ -1611,8 +1611,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -1620,235 +1620,234 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $54, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $55, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $57, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftlw 
$1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $48, %k0, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 +; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $49, %k0, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrq $49, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $50, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $51, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: 
kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $52, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $42, %k0, %k3 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $42, %k0, %k1 +; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k3 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k3, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, 
%k2, %k2 ; AVX512BW-NEXT: kshiftrq $44, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $45, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $46, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $47, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $37, %k0, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 
2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $37, %k0, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $38, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $39, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 @@ -1857,113 +1856,114 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $40, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $41, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, 
%k3, %k3 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $32, %k0, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 +; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $33, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $34, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 +; AVX512BW-NEXT: korw %k6, %k1, 
%k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $35, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $36, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 @@ -1981,44 +1981,45 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $28, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $29, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $30, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 @@ -2026,7 +2027,6 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 @@ -2035,8 +2035,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $21, %k0, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 @@ -2047,38 +2047,37 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $23, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $24, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 @@ -2096,7 +2095,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 @@ -2106,8 +2106,7 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $16, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kandw %k5, %k1, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 @@ -2115,34 +2114,34 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $17, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $18, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $19, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 @@ -2182,52 +2181,52 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $12, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $13, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $14, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 @@ -2244,58 +2243,58 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z} ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $6, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $7, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 ; 
AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
 ; AVX512BW-NEXT: kshiftrq $8, %k0, %k3
 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
 ; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
 ; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
 ; AVX512BW-NEXT: korw %k3, %k2, %k2
 ; AVX512BW-NEXT: kshiftrq $9, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k3
 ; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k3
 ; AVX512BW-NEXT: korw %k3, %k2, %k2
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -3588,10 +3587,9 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kmovd (%rdi), %k5
 ; AVX512BW-NEXT: kshiftrd $1, %k5, %k1
 ; AVX512BW-NEXT: movw $-3, %ax
-; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kmovd %eax, %k6
 ; AVX512BW-NEXT: kmovw (%rdi), %k2
-; AVX512BW-NEXT: kandw %k3, %k2, %k3
+; AVX512BW-NEXT: kandw %k6, %k2, %k3
 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
 ; AVX512BW-NEXT: korw %k4, %k3, %k3
@@ -3639,9 +3637,9 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
 ; AVX512BW-NEXT: korw %k3, %k2, %k2
 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
-; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovd %eax, %k7
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k2, %k1
 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
@@ -3653,9 +3651,9 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k4
 ; AVX512BW-NEXT: korw %k4, %k3, %k3
 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
-; AVX512BW-NEXT: kmovd %eax, %k7
-; AVX512BW-NEXT: kandw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kmovd %eax, %k4
+; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k4
 ; AVX512BW-NEXT: korw %k4, %k3, %k3
 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
@@ -3671,9 +3669,9 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
 ; AVX512BW-NEXT: korw %k2, %k3, %k2
 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
-; AVX512BW-NEXT: kmovd %eax, %k6
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kmovd %eax, %k3
+; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k2, %k1
 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
@@ -3688,8 +3686,8 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
 ; AVX512BW-NEXT: kshiftrd $28, %k5, %k1
 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k3
+; AVX512BW-NEXT: kandw %k6, %k1, %k3
+; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
 ; AVX512BW-NEXT: korw %k4, %k3, %k3
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
@@ -3722,15 +3720,15 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k4
 ; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k4
 ; AVX512BW-NEXT: korw %k4, %k2, %k2
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
 ; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
 ; AVX512BW-NEXT: kshiftrd $31, %k5, %k3
 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k7
@@ -3743,77 +3741,77 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $2, %k4, %k7
 ; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
 ; AVX512BW-NEXT: korw %k3, %k2, %k2
 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
 ; AVX512BW-NEXT: korw %k4, %k2, %k2
 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $25, %k5, %k3
-; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k4
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k2
+; AVX512BW-NEXT: kshiftrd $25, %k5, %k2
+; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kandw %k6, %k2, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
 ; AVX512BW-NEXT: kshiftrd $26, %k5, %k7
 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
 ; AVX512BW-NEXT: kshiftrd $27, %k5, %k6
 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k1
-; AVX512BW-NEXT: korw %k1, %k4, %k1
+; AVX512BW-NEXT: korw %k1, %k3, %k1
 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
 ; AVX512BW-NEXT: korw %k6, %k1, %k1
@@ -3830,7 +3828,8 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k1, %k6, %k6
 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k7
 ; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k6, %k6
 ; AVX512BW-NEXT: kshiftrd $23, %k5, %k7
 ; AVX512BW-NEXT: kmovq %k5, %k0
 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
@@ -3852,8 +3851,7 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6
 ; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
 ; AVX512BW-NEXT: kshiftrd $24, %k0, %k6
 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
@@ -3862,10 +3860,11 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
@@ -3878,77 +3877,76 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
 ; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kandw %k4, %k5, %k5
 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k2, %k3
-; AVX512BW-NEXT: korw %k3, %k5, %k3
-; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
-; AVX512BW-NEXT: korw %k7, %k3, %k2
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
+; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k2, %k2
 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z}
 ; AVX512BW-NEXT: kshiftrd $19, %k0, %k2
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k2, %k3
+; AVX512BW-NEXT: kandw %k7, %k2, %k4
 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k6
 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
 ; AVX512BW-NEXT: kshiftrd $20, %k0, %k5
 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
 ; AVX512BW-NEXT: kshiftrd $21, %k0, %k5
 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kandw %k3, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k3, %k1
+; AVX512BW-NEXT: korw %k1, %k4, %k1
 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
@@ -3971,16 +3969,16 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k3, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k3
 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k4
 ; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k4
 ; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k4
 ; AVX512BW-NEXT: korw %k4, %k1, %k1
@@ -4024,53 +4022,53 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftrd $12, %k0, %k3
 ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k3, %k2
 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: korw %k4, %k2, %k2
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k3, %k1
+; AVX512BW-NEXT: korw %k1, %k2, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $14, %k0, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k4
+; AVX512BW-NEXT: korw %k4, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $14, %k0, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $9, %k3, %k4
-; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k3, %k4
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k4
 ; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k3, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k4
 ; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k3, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k2, %k4
 ; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $15, %k0, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
+; AVX512BW-NEXT: kshiftrw $5, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $15, %k0, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k4
 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k5
 ; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $3, %k4, %k5
 ; AVX512BW-NEXT: korw %k5, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
@@ -4079,17 +4077,17 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: korw %k5, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
 ; AVX512BW-NEXT: korw %k4, %k1, %k1
 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $9, %k0, %k3
-; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kshiftrd $9, %k0, %k2
+; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k4
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k1
+; AVX512BW-NEXT: kandw %k1, %k2, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k1
 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
 ; AVX512BW-NEXT: korw %k5, %k4, %k4
@@ -4105,7 +4103,8 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k7, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6
 ; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
 ; AVX512BW-NEXT: korw %k6, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4126,16 +4125,15 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
 ; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5
 ; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: kandw %k3, %k4, %k4
 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k5
 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
@@ -4144,63 +4142,63 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k6
 ; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
-; AVX512BW-NEXT: kshiftlw $14, %k7, %k2
-; AVX512BW-NEXT: korw %k2, %k4, %k2
-; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k2} {z}
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k4, %k4
+; AVX512BW-NEXT: kshiftlw $14, %k7, %k3
+; AVX512BW-NEXT: korw %k3, %k4, %k3
+; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z}
 ; AVX512BW-NEXT: kshiftrd $6, %k0, %k4
 ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k4, %k5
 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; AVX512BW-NEXT: kshiftrw $14, %k4, %k6
 ; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
 ; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
 ; AVX512BW-NEXT: kshiftrd $7, %k0, %k6
 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
 ; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k5, %k5
 ; AVX512BW-NEXT: kshiftrd $8, %k0, %k6
 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k5, %k5
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
@@ -4208,69 +4206,69 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k6
 ; AVX512BW-NEXT: korw %k6, %k5, %k5
 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k5, %k5
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k5, %k1
-; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
+; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT: korw %k1, %k2, %k1
 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z}
 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $14, %k5, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $13, %k5, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $12, %k5, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $4, %k0, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $11, %k3, %k5
+; AVX512BW-NEXT: kshiftrw $14, %k5, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $13, %k5, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $12, %k5, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $4, %k0, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
 ; AVX512BW-NEXT: korw %k5, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $10, %k3, %k5
+; AVX512BW-NEXT: kshiftrw $10, %k2, %k5
 ; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
 ; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k3, %k5
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
 ; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
 ; AVX512BW-NEXT: kshiftrd $5, %k0, %k0
 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $6, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $5, %k0, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $4, %k0, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
@@ -4309,41 +4307,41 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512F-ONLY: # %bb.0:
 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
-; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
-; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1
+; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-ONLY-NEXT: movw $1, %ax
 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
-; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
-; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
-; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
-; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
-; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6
-; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
-; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8
+; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
+; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
+; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm1
+; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
+; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2
 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
-; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10
+; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10
 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
-; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12
-; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3
-; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm13
-; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm14
-; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm15
-; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm16
-; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4
-; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm17
-; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm18
-; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm19
-; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm5
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12
+; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4
+; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm13
+; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm8, %zmm14
+; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm15
+; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm16
+; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm3, %zmm5
+; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm7, %zmm17
+; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm8, %zmm18
+; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm9, %zmm19
+; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm3, %zmm3
+; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm6
 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7
+; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm8, %zmm8
 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9
 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0
 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
@@ -4351,22 +4349,22 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
+; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
+; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z}
-; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
-; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
-; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
+; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
+; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1
 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1
 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1
 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
-; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z}
+; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
+; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1
 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
@@ -4375,33 +4373,33 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
-; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z}
+; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
+; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
-; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z}
-; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
-; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx)
-; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1152(%rdx)
+; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
+; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z}
+; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1216(%rdx)
+; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1152(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1088(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1024(%rdx)
-; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 960(%rdx)
+; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 896(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 832(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 768(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx)
-; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 640(%rdx)
+; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 640(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 448(%rdx)
-; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 384(%rdx)
-; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx)
-; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx)
-; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 192(%rdx)
+; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx)
+; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx)
+; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx)
+; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx)
@@ -4412,41 +4410,41 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
-; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
+; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
 ; AVX512DQ-NEXT: movw $1, %ax
 ; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
-; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
-; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
-; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
-; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
-; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8
+; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm1
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2
 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
-; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10
 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
-; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12
-; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3
-; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm13
-; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm14
-; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm15
-; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm16
-; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4
-; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm17
-; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm18
-; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm19
-; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm5
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4
+; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm13
+; AVX512DQ-NEXT: vpermd %zmm5, %zmm8, %zmm14
+; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm15
+; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm16
+; AVX512DQ-NEXT: vpermd %zmm5, %zmm3, %zmm5
+; AVX512DQ-NEXT: vpermd %zmm6, %zmm7, %zmm17
+; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm18
+; AVX512DQ-NEXT: vpermd %zmm6, %zmm9, %zmm19
+; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3
+; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm6
 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7
+; AVX512DQ-NEXT: vpermd %zmm0, %zmm8, %zmm8
 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9
 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0
 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
@@ -4454,22 +4452,22 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
+; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
-; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z}
-; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
-; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
-; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
-; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
-; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
+; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
+; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1
 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1
 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1
 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
-; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
-; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
+; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1
 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
@@ -4478,33 +4476,33 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
-; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
-; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
+; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
-; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
-; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z}
-; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
-; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
-; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1152(%rdx)
+; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
+; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1216(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1152(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1088(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1024(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm3, 960(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 768(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm4, 640(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm5, 640(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx)
@@ -4640,8 +4638,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4650,12 +4648,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
@@ -4699,11 +4697,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
 ; AVX512BW-NEXT: kshiftrq $8, %k5, %k1
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
@@ -4716,12 +4715,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4768,29 +4766,29 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrq $12, %k5, %k1
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
@@ -4832,15 +4830,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kmovq %k4, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4849,16 +4847,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -4896,32 +4893,32 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrq $18, %k5, %k1
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -4930,8 +4927,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k7
 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k7} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k0
 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4948,20 +4945,19 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4970,22 +4966,23 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
 ; AVX512BW-NEXT: kshiftrq $22, %k5, %k1
 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -4994,11 +4991,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k7
 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z}
-; AVX512BW-NEXT: kandw %k4, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k0
 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5007,20 +5005,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5037,12 +5035,10 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5051,57 +5047,59 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
 ; AVX512BW-NEXT: korw %k6, %k0, %k7
 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k0
 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrq $26, %k5, %k1
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
 ; AVX512BW-NEXT: kshiftrq $27, %k5, %k1
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
 ; AVX512BW-NEXT: korw %k1, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5110,20 +5108,19 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
 ; AVX512BW-NEXT: korw %k7, %k0, %k0
 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
 ; AVX512BW-NEXT: korw %k6, %k0, %k6
 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k6} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k0
+; AVX512BW-NEXT: kandw %k2, %k1, %k0
 ; AVX512BW-NEXT: kshiftrq $29, %k5, %k1
 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
@@ -5132,37 +5129,36 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
 ; AVX512BW-NEXT: korw %k6, %k0, %k0
 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
 ; AVX512BW-NEXT:
korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $30, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5171,16 +5167,16 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5196,11 +5192,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5209,20 +5206,19 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5231,16 +5227,16 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -5261,53 +5257,53 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $36, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, 
%k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -5316,8 +5312,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5334,16 +5330,14 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw 
%k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5352,18 +5346,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5372,14 +5368,14 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 @@ -5389,24 +5385,23 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 @@ -5415,16 +5410,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5433,36 +5427,38 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k6 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z} -; AVX512BW-NEXT: kandw %k3, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5475,33 +5471,31 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5514,36 +5508,38 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw 
$12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5552,20 +5548,18 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5578,12 +5572,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw 
$13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5592,10 +5585,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -5603,28 +5597,30 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5645,14 +5641,13 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $55, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte 
Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -5663,7 +5658,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5680,21 +5676,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5715,49 +5710,49 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, 
%k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5770,19 +5765,18 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 @@ -5792,23 +5786,25 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -5816,8 +5812,7 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -6389,15 +6384,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-17, %ax ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: movw $-17, %ax +; AVX512BW-NEXT: kmovd %eax, %k7 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax @@ -6455,16 +6450,16 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k1 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k1 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k4, %k2 ; AVX512BW-NEXT: 
korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -6475,37 +6470,37 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k4 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $30, %k5, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -6517,12 +6512,12 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; 
AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -6537,79 +6532,79 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 ; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kmovq %k5, %k1 -; AVX512BW-NEXT: kshiftrd $26, %k5, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 +; AVX512BW-NEXT: kshiftrd $26, %k5, %k4 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k1 +; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k0, %k6, %k6 -; AVX512BW-NEXT: kshiftrd $27, %k1, %k7 +; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kshiftrd $27, %k5, %k7 +; AVX512BW-NEXT: kmovq %k5, %k2 +; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $28, %k1, %k6 -; AVX512BW-NEXT: kmovq %k1, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrd $28, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k5, %k3 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: korw %k0, %k3, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kmovq %k4, %k0 -; AVX512BW-NEXT: kshiftrd $24, %k4, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k3 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $24, %k0, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 @@ -6617,16 +6612,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -6635,145 +6629,145 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; 
AVX512BW-NEXT: kshiftrw $8, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k5, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k4, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k3 +; AVX512BW-NEXT: kshiftrw $2, %k4, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k5, %k2, %k1 +; AVX512BW-NEXT: korw %k4, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovq %k0, %k1 -; AVX512BW-NEXT: kshiftrd $21, %k0, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $21, %k1, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k0, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; 
AVX512BW-NEXT: kandw %k0, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $22, %k1, %k4 ; AVX512BW-NEXT: kmovq %k1, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kandw %k0, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k4 ; AVX512BW-NEXT: kshiftrd $23, %k7, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 +; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: korw %k3, %k4, %k3 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z} ; AVX512BW-NEXT: kmovq %k7, %k4 ; AVX512BW-NEXT: kshiftrd $18, %k7, %k6 ; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $19, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $20, %k4, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 @@ -6790,104 +6784,107 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; 
AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: korw %k0, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512BW-NEXT: kmovq %k4, %k0 ; AVX512BW-NEXT: kshiftrd $16, %k4, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $17, %k4, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 +; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; 
AVX512BW-NEXT: kshiftrw $8, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kmovq %k4, %k0 -; AVX512BW-NEXT: kshiftrd $13, %k4, %k3 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $13, %k0, %k3 ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k5, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k3 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k5, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k5, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k3 ; 
AVX512BW-NEXT: kmovq %k0, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 @@ -6897,68 +6894,67 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k3 ; AVX512BW-NEXT: kshiftrd $15, %k7, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $10, %k7, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: kmovq %k7, %k2 +; AVX512BW-NEXT: kshiftrd $10, %k7, %k0 +; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k1, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: 
kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $11, %k7, %k6 -; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -6973,56 +6969,55 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $12, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k5, %k4 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: korw %k0, %k4, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 
2-byte Reload +; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kshiftrd $8, %k2, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 @@ -7052,7 +7047,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -7064,46 +7060,45 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $2, %k2, %k4 ; AVX512BW-NEXT: kmovq %k2, %k5 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $5, %k1, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k4, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 +; AVX512BW-NEXT: kshiftrd $5, %k1, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k6, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, 
%k3 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $6, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kandw %k0, %k3, %k3 @@ -7131,8 +7126,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovq %k2, %k6 -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 @@ -7140,34 +7135,33 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k3, %k4, %k3 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $3, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, 
%k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 @@ -7231,63 +7225,63 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: movw $1, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; 
AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm3 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm5 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm4, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm11, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm4, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm10, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm11, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm12, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm4, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm14, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm10, %zmm10 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm12, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm14, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 @@ -7296,8 +7290,8 @@ define 
void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 @@ -7308,42 +7302,42 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1152(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1088(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 704(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 640(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 576(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -7351,63 +7345,63 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: movw $1, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vpermd %zmm7, 
%zmm12, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm11, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm10, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm11, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm12, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm4, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm14, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm10, %zmm10 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vpermd %zmm0, %zmm12, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm14, %zmm4 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 @@ -7416,8 +7410,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 @@ -7428,42 +7422,42 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: 
vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1152(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 704(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 640(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -7570,6 +7564,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovq %k5, %k3 ; AVX512BW-NEXT: kshiftrq $3, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 @@ -7578,27 +7573,27 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kshiftrq $4, %k3, %k1 +; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -7606,8 +7601,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -7804,8 +7799,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 @@ -7865,11 +7860,10 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovq %k4, %k3 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -7911,15 +7905,16 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $17, 
%k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -7933,7 +7928,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 +; AVX512BW-NEXT: kmovq %k7, %k4 +; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 @@ -7955,7 +7951,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $19, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $19, %k4, %k1 +; AVX512BW-NEXT: kmovq %k4, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -7963,8 +7960,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -7981,7 +7978,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $20, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $20, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8005,7 +8002,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $21, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $21, %k7, %k1 +; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8024,10 +8022,10 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $22, 
%k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovq %k3, %k5 +; AVX512BW-NEXT: kshiftrq $22, %k3, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -8056,26 +8054,27 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $24, %k5, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -8233,8 +8232,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -8246,28 +8245,28 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $32, %k2, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -8291,87 +8290,89 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $33, %k2, %k1 +; AVX512BW-NEXT: kmovq %k2, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k5, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovq %k7, %k5 +; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; 
AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $36, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $36, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -8382,12 +8383,12 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: 
kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $37, %k7, %k1 +; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8395,62 +8396,64 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z} -; AVX512BW-NEXT: kandw %k4, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 +; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kshiftrq $38, %k3, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $39, %k5, %k6 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $39, %k7, %k6 +; AVX512BW-NEXT: kmovq %k7, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -8463,99 +8466,102 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $42, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $42, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kshiftrq $43, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $44, %k5, %k1 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $44, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -8576,7 +8582,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $45, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8584,39 +8590,39 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8633,43 +8639,44 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; 
AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8677,12 +8684,12 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -8693,7 +8700,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $50, %k5, %k1 +; AVX512BW-NEXT: kmovq %k7, %k5 +; AVX512BW-NEXT: kshiftrq 
$50, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 @@ -8716,31 +8724,32 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $52, %k5, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $52, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8752,19 +8761,20 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $53, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8780,54 +8790,53 @@ define void 
@mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $54, %k5, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $54, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $55, %k5, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $55, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw 
%k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -8835,6 +8844,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $56, %k5, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k1 @@ -8845,44 +8855,44 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 @@ -8912,18 +8922,20 @@ define void @mask_replication_factor6_vf64(ptr 
%in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8932,8 +8944,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -8944,12 +8955,11 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8970,8 +8980,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8980,48 +8989,49 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw 
%k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $63, %k5, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) @@ -9685,7 +9695,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k6 +; AVX512BW-NEXT: kmovq %k2, %k3 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: 
kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 @@ -9694,7 +9704,6 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-9, %ax @@ -9725,8 +9734,8 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovd (%rdi), %k3 -; AVX512BW-NEXT: kshiftrd $1, %k3, %k0 +; AVX512BW-NEXT: kmovd (%rdi), %k6 +; AVX512BW-NEXT: kshiftrd $1, %k6, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 @@ -9771,7 +9780,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $2, %k3, %k2 +; AVX512BW-NEXT: kshiftrd $2, %k6, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kshiftlw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 @@ -9781,357 +9790,362 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k1, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $29, %k3, %k1 +; AVX512BW-NEXT: kmovq %k6, %k2 +; AVX512BW-NEXT: kshiftrd $29, %k6, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k0 +; AVX512BW-NEXT: kandw %k3, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k2 -; AVX512BW-NEXT: kshiftrd $30, %k3, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k2 -; AVX512BW-NEXT: kshiftrd $31, %k3, %k4 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k4, %k0 
-; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovq %k4, %k6 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $30, %k2, %k1 +; AVX512BW-NEXT: kmovq %k2, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k3 +; AVX512BW-NEXT: kshiftrd $31, %k4, %k0 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k0, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $27, %k3, %k2 -; AVX512BW-NEXT: 
kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k3, %k0 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k0, %k1 +; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: kshiftrd $27, %k4, %k1 +; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 +; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k3, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k7 +; AVX512BW-NEXT: kshiftrw $13, %k3, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k7 -; AVX512BW-NEXT: kshiftrd $28, %k3, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k3, %k7 +; AVX512BW-NEXT: korw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k7 +; AVX512BW-NEXT: kshiftrd $28, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k7, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovq %k2, %k4 +; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: kmovq %k3, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 ; AVX512BW-NEXT: korw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z} +; AVX512BW-NEXT: korw %k7, %k0, %k2 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $25, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k0 -; AVX512BW-NEXT: korw %k0, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k5 -; AVX512BW-NEXT: kshiftrd $26, %k6, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 +; AVX512BW-NEXT: 
kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $26, %k6, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $2, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k2 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k2} {z} +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $23, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $22, 
%k2, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $23, %k2, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 +; AVX512BW-NEXT: kshiftrd $22, %k2, %k5 +; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovq %k2, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k5 +; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k2 -; AVX512BW-NEXT: kshiftrd $24, %k6, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $24, %k6, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $20, %k6, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $20, %k3, %k5 +; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k5 -; AVX512BW-NEXT: kshiftrd $21, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 +; AVX512BW-NEXT: kandw %k0, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 +; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $21, %k3, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, 
%k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k5 +; AVX512BW-NEXT: kandw %k0, %k2, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k1, %k4, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k7, %k1 +; AVX512BW-NEXT: korw %k1, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $18, %k7, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $18, %k2, %k4 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: kmovq %k1, 
%k2 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $19, %k7, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $19, %k2, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -10139,99 +10153,98 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k0, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 -; AVX512BW-NEXT: kmovq %k2, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 +; AVX512BW-NEXT: kmovq %k3, %k7 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw 
$1, %k2, %k2 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k2} {z} +; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k5, %k3 +; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $16, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k2 +; AVX512BW-NEXT: korw %k0, %k3, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k3 ; AVX512BW-NEXT: kshiftrd $17, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: 
kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: korw %k0, %k3, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload @@ -10245,88 +10258,90 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $13, %k0, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k1 +; AVX512BW-NEXT: kandw %k6, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k1, %k3 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 ; AVX512BW-NEXT: kmovq %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k1, 
%k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $15, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k5 +; AVX512BW-NEXT: kshiftrd $15, %k6, %k3 +; AVX512BW-NEXT: kmovq %k6, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k5, %k3 +; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $11, %k2, %k6 -; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovq %k0, %k3 +; AVX512BW-NEXT: kshiftrd $11, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kandw %k1, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, 
%k5, %k5 +; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k7, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $12, %k2, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $12, %k3, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -10334,301 +10349,301 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k6 -; AVX512BW-NEXT: kmovq %k4, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k0, %k4, %k4 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k4} {z} +; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 +; AVX512BW-NEXT: kmovq %k3, %k0 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte 
Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $9, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k1, %k4, %k5 -; AVX512BW-NEXT: kshiftrd $10, %k6, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $10, %k6, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $7, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $6, %k3, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $7, %k4, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 +; AVX512BW-NEXT: kshiftrd $6, %k4, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k7, %k2, %k4 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k4, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k4 -; AVX512BW-NEXT: kshiftrd $8, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; 
AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $8, %k4, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k4, %k4 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $4, %k6, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $4, %k6, %k3 +; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 +; AVX512BW-NEXT: kandw %k0, %k3, %k2 +; AVX512BW-NEXT: kshiftlw $15, 
%k3, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k5 -; AVX512BW-NEXT: kshiftrd $5, %k6, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $5, %k6, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kandw %k3, %k4, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k5 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k2, 
%k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $14, %k4, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k4 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k4 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k4 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k4 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $3, %k1, %k2 -; AVX512BW-NEXT: kshiftlw 
$15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 @@ -10661,274 +10676,274 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64: ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: movw $1, %ax ; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 
-; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm15, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm15, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm24 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm25 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm26 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm27 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm15, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm0 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm11, %zmm11, %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm3 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm17, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm19, %zmm7 +; 
AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm5, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm15, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm16, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm17, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm18, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm19, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm5, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm13, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm15, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm16, %zmm26 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm17, %zmm27 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm18, %zmm28 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm5, %zmm29 +; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm19, %zmm30 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm31 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm16, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm18, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm19, %zmm5 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm27 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm26 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm25 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm23 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} 
+; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm19 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm18 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1728(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1664(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1600(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1536(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1728(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1664(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1600(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1152(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1088(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1024(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx) -; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 256(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor7_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: movw $1, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm15, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm21 -; 
AVX512DQ-NEXT: vpermd %zmm4, %zmm15, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm26 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm27 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm15, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm0 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm11 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm5, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm15, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm16, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm5, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm13, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm15, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm16, %zmm26 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm17, %zmm27 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm18, %zmm28 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm5, %zmm29 +; AVX512DQ-NEXT: vpermd %zmm11, %zmm19, %zmm30 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm31 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm19, %zmm5 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} +; 
AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm27 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm26 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm25 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm24 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm23 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm19 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm18 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1728(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1664(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1600(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1536(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 
1536(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1728(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1664(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1600(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1152(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1088(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -10944,8 +10959,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovq %k2, %k3 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-9, %ax @@ -10962,9 +10978,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-33, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovq %k2, %k4 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-65, %ax @@ -10977,8 +10992,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq 
(%rdi), %k3 -; AVX512BW-NEXT: kshiftrq $1, %k3, %k0 +; AVX512BW-NEXT: kmovq (%rdi), %k4 +; AVX512BW-NEXT: kshiftrq $1, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 @@ -10995,9 +11010,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovd %eax, %k5 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF @@ -11013,16 +11028,16 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $2, %k3, %k1 +; AVX512BW-NEXT: kshiftrq $2, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -11034,22 +11049,22 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftrq $3, %k3, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kshiftrq $3, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11057,24 +11072,23 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11083,11 +11097,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -11097,108 +11112,109 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $5, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte 
Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $6, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 +; AVX512BW-NEXT: kshiftrq $6, %k7, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k0, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $7, %k7, %k0 +; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $7, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: 
kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $8, %k4, %k0 +; AVX512BW-NEXT: kmovq %k4, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -11206,18 +11222,17 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $9, %k2, %k1 +; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} @@ -11229,11 +11244,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, 
%k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11242,20 +11257,20 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $10, %k2, %k0 +; AVX512BW-NEXT: kshiftrq $10, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 @@ -11265,25 +11280,26 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $11, %k2, %k6 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $11, %k5, %k6 +; AVX512BW-NEXT: kmovq %k5, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -11294,10 +11310,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kshiftrq $12, %k2, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kshiftrq $12, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11313,42 +11328,43 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $13, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $13, %k7, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kandw %k5, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 +; AVX512BW-NEXT: korw %k1, %k6, %k6 +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; 
AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload @@ -11375,34 +11391,34 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $15, %k5, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 @@ -11451,10 +11467,10 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11471,8 +11487,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), 
%zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11483,8 +11499,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11523,19 +11539,19 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kandw %k5, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 @@ -11548,15 +11564,15 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11573,22 +11589,22 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $22, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -11596,9 +11612,9 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $23, %k2, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $23, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11610,8 +11626,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -11622,34 +11638,32 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $24, %k2, %k0 +; AVX512BW-NEXT: kshiftrq $24, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -11658,12 +11672,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $25, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11673,7 +11687,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -11691,34 +11706,33 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 
2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $27, %k7, %k6 -; AVX512BW-NEXT: kmovq %k7, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -11729,18 +11743,18 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $28, %k4, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $28, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11751,50 +11765,50 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $29, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: 
kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $29, %k7, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 +; AVX512BW-NEXT: korw %k1, %k6, %k6 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $30, %k7, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $30, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11802,54 +11816,54 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $31, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -11859,19 +11873,18 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; 
AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 @@ -11882,33 +11895,33 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 ; AVX512BW-NEXT: kmovq %k7, %k3 +; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -11933,12 +11946,13 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 ; AVX512BW-NEXT: kshiftrq $35, %k3, %k0 +; AVX512BW-NEXT: kmovq %k3, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -11949,29 +11963,29 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; 
AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $36, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -12007,31 +12021,32 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; AVX512BW-NEXT: kandw %k5, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $38, %k7, %k0 +; AVX512BW-NEXT: kmovq %k7, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 
; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -12039,10 +12054,10 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $39, %k7, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kshiftrq $39, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12057,91 +12072,92 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $40, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: 
kshiftrq $41, %k3, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $41, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} -; AVX512BW-NEXT: kandw %k5, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $42, %k3, %k0 +; AVX512BW-NEXT: kshiftrq $42, %k4, %k0 +; AVX512BW-NEXT: kmovq %k4, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12149,8 +12165,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 @@ -12166,15 +12182,16 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -12200,44 +12217,42 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovq %k4, %k5 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $45, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $45, %k7, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 +; AVX512BW-NEXT: korw %k1, %k6, %k6 +; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z} +; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: 
kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $46, %k7, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $46, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12260,38 +12275,38 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $47, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 @@ -12302,42 +12317,42 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, 
%k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k0 ; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -12350,7 +12365,6 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -12362,12 +12376,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -12376,60 +12389,61 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; 
AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $51, %k2, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $51, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $52, %k2, %k6 +; AVX512BW-NEXT: kshiftrq $52, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; 
AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $53, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -12442,37 +12456,37 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $54, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -12483,21 +12497,22 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} -; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $55, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: 
kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k5, %k1, %k1 @@ -12507,43 +12522,42 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $56, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $57, %k2, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $57, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k7} {z} @@ -12555,24 +12569,26 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k1, 
%k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k2, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kshiftrq $58, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -12583,25 +12599,24 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $59, %k2, %k6 +; AVX512BW-NEXT: kshiftrq $59, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -12623,15 +12638,14 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k5 -; AVX512BW-NEXT: kshiftrq $60, %k2, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $60, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; 
AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12643,39 +12657,39 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $61, %k5, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k6 +; AVX512BW-NEXT: kshiftrq $61, %k5, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 +; AVX512BW-NEXT: korw %k1, %k6, %k6 +; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $62, %k5, %k0 @@ -12702,7 +12716,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 @@ -12711,14 +12726,13 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftlw $15, %k5, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -13119,26 +13133,26 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32: ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm6 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm8 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm10 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm12 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm14 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm16, %zmm16, %zmm16 {%k1} {z} -; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm2, %zmm2 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7 @@ -13160,10 +13174,10 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 ; 
AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 @@ -13176,17 +13190,17 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 960(%rdx) +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 832(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 640(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) @@ -13200,26 +13214,26 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-LABEL: mask_replication_factor8_vf32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm1, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; 
AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm1 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm16, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm16, %zmm2, %zmm2 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7 @@ -13241,10 +13255,10 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 @@ -13257,17 +13271,17 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 960(%rdx) +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 832(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 640(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) @@ -13351,303 +13365,319 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64: ; AVX512F-ONLY: # %bb.0: +; AVX512F-ONLY-NEXT: subq $136, %rsp ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm10, %zmm10, %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; 
AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm16 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm17, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm24 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm25 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm26 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm27 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm28 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm11, %zmm29 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm13, %zmm30 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm15, %zmm31 -; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm17, %zmm2 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm17, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm12, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm14, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm16, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm22, %zmm5 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm24, %zmm7 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm26, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm15 
+; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm12, %zmm27 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm14, %zmm28 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm16, %zmm29 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm18, %zmm30 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm20, %zmm31 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm22, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm24, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm26, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm8 +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm15 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm30 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm2 
{%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm29 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm28 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm27 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm26 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm25 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm23 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm21 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm20 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm19 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm16 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm14 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm12 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1792(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1856(%rsi), %zmm8 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1920(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 
64-byte Reload +; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm31, 1984(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1920(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1856(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1792(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1728(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1664(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1600(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1152(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 1088(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1024(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 896(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm28, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm29, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm30, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm30, 1920(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm29, 1856(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm28, 1792(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1728(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1664(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1600(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 1536(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1088(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 1024(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 960(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 384(%rdx) +; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512F-ONLY-NEXT: addq $136, %rsp ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor8_vf64: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: subq $136, %rsp ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10 ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm17, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm26 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm27 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm28 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm11, %zmm29 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm13, %zmm30 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm15, %zmm31 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm17, %zmm2 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm17, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; 
AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm22, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm24, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm12, %zmm27 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm14, %zmm28 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm16, %zmm29 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm18, %zmm30 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm20, %zmm31 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm22, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm24, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm26, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm8 +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm15 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z} ; 
AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm30 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm29 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm28 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm27 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm26 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm25 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 -; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm24 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm23 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm21 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm20 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm19 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 +; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 -; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm16 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm14 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm12 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1792(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1856(%rsi), %zmm8 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: 
vmovdqa32 1920(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 +; AVX512DQ-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 +; AVX512DQ-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z} +; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 +; AVX512DQ-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 ; AVX512DQ-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1984(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1920(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1856(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1792(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1728(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1664(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1600(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1152(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 1088(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1024(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 896(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm28, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1920(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1856(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1792(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1728(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1664(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1600(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1536(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1088(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1024(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 960(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512DQ-NEXT: 
vmovdqa64 %zmm10, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rdx) +; AVX512DQ-NEXT: addq $136, %rsp ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -13661,75 +13691,75 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm17 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm14 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm16 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm15 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm9 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm4 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm2, %k2 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm5, %k2 +; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm10, %k1 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm10 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm9, %k2 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm15, %k2 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k1} {z} +; 
AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm15 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm14, %k1 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm14 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm16, %k1 +; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm16 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm17, %k2 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm17 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm12, %k2 +; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm12, %k1 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm7, %k1 +; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm7, %k2 -; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k1} {z} -; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} @@ -13748,23 +13778,23 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 768(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 768(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 
704(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll index 4e90e4c5fa4da..9294ae48a7695 100644 --- a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll +++ b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll @@ -169,70 +169,70 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovq {{.*#+}} xmm9 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm10 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm11 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm12 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm13 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm14 = mem[0],zero ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm9, %xmm9 -; AVX1-NEXT: vpmovsxbd %xmm9, %xmm13 +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm15 ; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm9, %xmm9 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm10, %xmm10 -; AVX1-NEXT: vpmovsxbd %xmm10, %xmm14 -; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm10, %xmm10 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm11, %xmm11 -; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-NEXT: # xmm15 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm0 -; AVX1-NEXT: vpslld %xmm15, %xmm0, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm12 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm10, %xmm9 +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm9, %xmm10 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm13, %xmm13 +; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm9 +; AVX1-NEXT: vpslld %xmm0, %xmm9, %xmm1 ; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: # xmm2 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm13, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm11, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm11, %xmm11 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm12, %xmm12 -; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm13 -; AVX1-NEXT: vpslld %xmm15, %xmm13, %xmm15 -; AVX1-NEXT: vpslld %xmm2, %xmm13, %xmm2 -; AVX1-NEXT: vpmovsxbd %xmm12, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm12, %xmm12 -; AVX1-NEXT: vblendvps %xmm9, %xmm15, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm9 +; AVX1-NEXT: vpslld %xmm2, %xmm9, %xmm9 +; AVX1-NEXT: 
vblendvps %xmm15, %xmm1, %xmm9, %xmm9 +; AVX1-NEXT: vpmovsxbd %xmm13, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm13, %xmm13 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm14, %xmm14 +; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm15 +; AVX1-NEXT: vpslld %xmm0, %xmm15, %xmm0 +; AVX1-NEXT: vpslld %xmm2, %xmm15, %xmm2 +; AVX1-NEXT: vpmovsxbd %xmm14, %xmm15 +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm14, %xmm14 +; AVX1-NEXT: vblendvps %xmm12, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm2 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpslld %xmm3, %xmm9, %xmm15 -; AVX1-NEXT: vpslld %xmm4, %xmm9, %xmm9 -; AVX1-NEXT: vblendvps %xmm14, %xmm15, %xmm9, %xmm9 -; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm3, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm4, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm10, %xmm15, %xmm14, %xmm10 -; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm1, %xmm15, %xmm14, %xmm1 -; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm5, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm6, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm11, %xmm15, %xmm14, %xmm11 -; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm13, %xmm15, %xmm14, %xmm13 -; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm14 -; AVX1-NEXT: vpslld %xmm7, %xmm14, %xmm15 -; AVX1-NEXT: vpslld %xmm8, %xmm14, %xmm14 -; AVX1-NEXT: vblendvps %xmm12, %xmm15, %xmm14, %xmm12 -; AVX1-NEXT: vmovups %xmm0, (%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm2, 16(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm9, 32(%rdi,%rcx,4) +; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm12 +; AVX1-NEXT: vpslld %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vblendvps %xmm11, %xmm12, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm11 +; AVX1-NEXT: vpslld %xmm3, %xmm11, %xmm12 +; AVX1-NEXT: vpslld %xmm4, %xmm11, %xmm11 +; AVX1-NEXT: vblendvps %xmm10, %xmm12, %xmm11, %xmm10 +; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm11 +; AVX1-NEXT: vpslld %xmm5, %xmm11, %xmm12 +; AVX1-NEXT: vpslld %xmm6, %xmm11, %xmm11 +; AVX1-NEXT: vblendvps %xmm1, %xmm12, %xmm11, %xmm1 +; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm11 +; AVX1-NEXT: vpslld %xmm5, %xmm11, %xmm12 +; AVX1-NEXT: vpslld %xmm6, %xmm11, %xmm11 +; AVX1-NEXT: vblendvps %xmm13, %xmm12, %xmm11, %xmm11 +; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm12 +; AVX1-NEXT: vpslld %xmm7, %xmm12, %xmm13 +; AVX1-NEXT: vpslld %xmm8, %xmm12, %xmm12 +; AVX1-NEXT: vblendvps %xmm15, %xmm13, %xmm12, %xmm12 +; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm13 +; AVX1-NEXT: vpslld %xmm7, %xmm13, %xmm15 +; AVX1-NEXT: vpslld %xmm8, %xmm13, %xmm13 +; AVX1-NEXT: vblendvps %xmm14, %xmm15, %xmm13, %xmm13 +; AVX1-NEXT: vmovups %xmm9, (%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm0, 16(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm2, 32(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm10, 48(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm1, 64(%rdi,%rcx,4) ; AVX1-NEXT: vmovups %xmm11, 80(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm13, 96(%rdi,%rcx,4) -; AVX1-NEXT: vmovups %xmm12, 112(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm12, 96(%rdi,%rcx,4) +; AVX1-NEXT: vmovups %xmm13, 112(%rdi,%rcx,4) ; AVX1-NEXT: addq $32, %rcx ; AVX1-NEXT: cmpq %rcx, %rdx ; AVX1-NEXT: jne .LBB0_4 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll 
b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll index 727b3ff2eb45c..3ca0e2121e0d1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -431,7 +431,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movd %eax, %xmm6 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax @@ -443,7 +443,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movd %eax, %xmm5 ; SSE2-NEXT: andl $15, %ecx ; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSE2-NEXT: movd %eax, %xmm9 @@ -473,10 +473,10 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] @@ -484,7 +484,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE2-NEXT: punpcklqdq 
{{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -515,7 +515,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax @@ -527,7 +527,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: andl $15, %ecx ; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSSE3-NEXT: movd %eax, %xmm9 @@ -557,10 +557,10 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] @@ -568,7 +568,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: 
var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -850,7 +850,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: andl $15, %r12d ; SSE2-NEXT: movzbl -24(%rsp,%r12), %eax -; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movd %eax, %xmm6 ; SSE2-NEXT: andl $15, %r15d ; SSE2-NEXT: movzbl -24(%rsp,%r15), %eax ; SSE2-NEXT: movd %eax, %xmm7 @@ -859,7 +859,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: movd %eax, %xmm8 ; SSE2-NEXT: andl $15, %ebx ; SSE2-NEXT: movzbl -24(%rsp,%rbx), %eax -; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movd %eax, %xmm5 ; SSE2-NEXT: andl $15, %r11d ; SSE2-NEXT: movzbl -24(%rsp,%r11), %eax ; SSE2-NEXT: movd %eax, %xmm9 @@ -868,10 +868,10 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: movd %eax, %xmm10 ; SSE2-NEXT: andl $15, %r9d ; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax -; SSE2-NEXT: movd %eax, %xmm11 +; SSE2-NEXT: movd %eax, %xmm12 ; SSE2-NEXT: andl $15, %r8d ; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax -; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: movd %eax, %xmm11 ; SSE2-NEXT: andl $15, %esi ; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm13 @@ -888,18 +888,18 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -948,7 +948,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: andl $15, %r12d ; SSSE3-NEXT: movzbl -24(%rsp,%r12), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: andl $15, %r15d ; SSSE3-NEXT: movzbl -24(%rsp,%r15), %eax ; SSSE3-NEXT: movd %eax, %xmm7 @@ -957,7 +957,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: movd %eax, %xmm8 ; SSSE3-NEXT: andl $15, %ebx ; SSSE3-NEXT: movzbl -24(%rsp,%rbx), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm5 ; SSSE3-NEXT: andl $15, %r11d ; SSSE3-NEXT: movzbl -24(%rsp,%r11), %eax ; SSSE3-NEXT: movd %eax, %xmm9 @@ -966,10 +966,10 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: andl $15, %r9d ; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax -; SSSE3-NEXT: movd %eax, %xmm11 +; SSSE3-NEXT: movd %eax, %xmm12 ; SSSE3-NEXT: andl $15, %r8d ; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm11 ; SSSE3-NEXT: andl $15, %esi ; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm13 @@ -986,18 +986,18 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index fbf9187df4817..2f3fdeb74dc47 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -4963,15 +4963,15 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SKX-NEXT: vpextrd $1, %xmm1, %r9d ; SKX-NEXT: movw %r9w, 27(%rdi) ; SKX-NEXT: vmovd %xmm1, %r8d -; SKX-NEXT: vpextrd $3, %xmm0, %esi +; SKX-NEXT: vpextrd $3, %xmm0, %edx ; SKX-NEXT: movw %r8w, 24(%rdi) -; SKX-NEXT: movw %si, 9(%rdi) -; SKX-NEXT: vpextrd $2, %xmm0, %edx -; SKX-NEXT: vpextrd $1, %xmm0, %ecx -; SKX-NEXT: movw %dx, 6(%rdi) -; SKX-NEXT: movw %cx, 3(%rdi) -; SKX-NEXT: vmovd %xmm0, %eax -; SKX-NEXT: movw %ax, (%rdi) +; SKX-NEXT: movw %dx, 9(%rdi) +; SKX-NEXT: vpextrd $2, %xmm0, %esi +; SKX-NEXT: vpextrd $1, %xmm0, %eax +; SKX-NEXT: movw %si, 6(%rdi) +; SKX-NEXT: movw %ax, 3(%rdi) +; SKX-NEXT: vmovd %xmm0, %ecx +; SKX-NEXT: movw %cx, (%rdi) ; SKX-NEXT: shrl $16, %r15d ; SKX-NEXT: movb %r15b, 47(%rdi) ; SKX-NEXT: shrl $16, %r14d @@ -4997,14 +4997,14 @@ define void @trunc_ssat_v16i32_v16i24(<16 x i32> %x, ptr %y) nounwind { ; SKX-NEXT: movw %r9w, 15(%rdi) ; SKX-NEXT: vmovd %xmm0, %r8d ; SKX-NEXT: movw %r8w, 12(%rdi) -; SKX-NEXT: shrl $16, %esi -; SKX-NEXT: movb %sil, 11(%rdi) ; SKX-NEXT: shrl $16, %edx -; SKX-NEXT: movb %dl, 8(%rdi) -; SKX-NEXT: shrl $16, %ecx -; SKX-NEXT: movb %cl, 5(%rdi) +; SKX-NEXT: movb %dl, 11(%rdi) +; SKX-NEXT: shrl $16, %esi +; SKX-NEXT: movb %sil, 8(%rdi) ; SKX-NEXT: shrl $16, %eax -; SKX-NEXT: movb %al, 2(%rdi) +; SKX-NEXT: movb %al, 5(%rdi) +; SKX-NEXT: shrl $16, %ecx +; SKX-NEXT: movb %cl, 2(%rdi) ; SKX-NEXT: shrl $16, %r11d ; SKX-NEXT: movb %r11b, 23(%rdi) ; SKX-NEXT: shrl $16, %r10d diff --git 
a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index 1719e2588db9e..e717ede16a9f0 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -849,8 +849,8 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; ; SSE41-LABEL: trunc_usat_v4i64_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm5, %xmm0 @@ -862,18 +862,18 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: movapd %xmm2, %xmm7 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm2, %xmm5 +; SSE41-NEXT: pxor %xmm3, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] ; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE41-NEXT: pand %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: packusdw %xmm7, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i16: diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll index c785db8879d49..a2affbd8728c2 100644 --- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll +++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll @@ -39,10 +39,10 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1 ; X86-NEXT: kmovw %k0, %eax ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload -; X86-NEXT: kmovw %k0, %ecx +; X86-NEXT: kmovw %k0, %edx ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload -; X86-NEXT: kmovw %k0, %edx +; X86-NEXT: kmovw %k0, %ecx ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload ; X86-NEXT: kmovw %k0, %edi @@ -50,11 +50,11 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1 ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 # 2-byte Reload ; X86-NEXT: kmovw %k2, %edi -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: kmovw %k1, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: kmovw %k1, %edx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: addl %edx, %eax +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movw %ax, (%esi) ; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 132a6beca8e95..d6716d0edff40 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1316,24 +1316,24 @@ define void @shl_32bytes(ptr 
%src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $72, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl (%edx), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE2-NEXT: movl (%edi), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%edx), %ecx +; X86-SSE2-NEXT: movl 4(%edi), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%edx), %edi -; X86-SSE2-NEXT: movl 12(%edx), %ebx -; X86-SSE2-NEXT: movl 16(%edx), %ebp +; X86-SSE2-NEXT: movl 8(%edi), %esi +; X86-SSE2-NEXT: movl 12(%edi), %ebx +; X86-SSE2-NEXT: movl 16(%edi), %ebp ; X86-SSE2-NEXT: movzbl (%eax), %eax -; X86-SSE2-NEXT: movl 20(%edx), %esi -; X86-SSE2-NEXT: movl 24(%edx), %ecx -; X86-SSE2-NEXT: movl 28(%edx), %edx -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl 20(%edi), %edx +; X86-SSE2-NEXT: movl 24(%edi), %ecx +; X86-SSE2-NEXT: movl 28(%edi), %edi +; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1348,20 +1348,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andb $31, %al ; X86-SSE2-NEXT: negb %al -; X86-SSE2-NEXT: movsbl %al, %eax -; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 48(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 60(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 56(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 68(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx +; X86-SSE2-NEXT: movsbl %al, %edx +; X86-SSE2-NEXT: movl 40(%esp,%edx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 44(%esp,%edx), %eax +; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-SSE2-NEXT: movl 52(%esp,%edx), %esi +; X86-SSE2-NEXT: movl 48(%esp,%edx), %edi +; X86-SSE2-NEXT: movl 60(%esp,%edx), %ebx +; X86-SSE2-NEXT: movl 56(%esp,%edx), %ebp +; X86-SSE2-NEXT: movl 68(%esp,%edx), %ecx +; X86-SSE2-NEXT: movl 64(%esp,%edx), %edx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) +; X86-SSE2-NEXT: movl %edx, 24(%eax) +; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl %ebp, 16(%eax) ; X86-SSE2-NEXT: movl %ebx, 20(%eax) ; X86-SSE2-NEXT: movl %edi, 8(%eax) diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 03cea0e0de6bf..24475360cbbc4 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -658,15 +658,15 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -676,27 +676,27 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -732,29 +732,29 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -788,26 +788,26 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ebx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 
12(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -974,15 +974,15 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -993,27 +993,27 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
%ebp, 12(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ebx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1049,21 +1049,21 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax @@ -1071,7 +1071,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1106,27 +1106,27 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%esi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%esi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ebx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%esi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%esi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 12(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1289,46 +1289,46 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -1365,29 +1365,29 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1422,26 +1422,26 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ebx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1642,24 +1642,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp ; 
X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -1775,24 +1775,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1822,26 +1822,26 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1849,20 +1849,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1882,30 +1882,30 @@ define void 
@lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1914,66 +1914,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 20(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 20(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %ebx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %eax -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1987,30 +1989,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 
(%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2022,55 +2024,57 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), 
%ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2201,29 +2205,29 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes: @@ -2277,24 +2281,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -2315,7 +2319,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi @@ -2326,19 +2330,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx @@ -2348,46 +2350,46 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi 
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) @@ -2412,24 +2414,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -2447,29 +2449,29 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx @@ -2521,30 +2523,30 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2555,66 +2557,64 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ecx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 84(%esp,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 84(%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) @@ -2633,26 +2633,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 
{{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -2670,56 +2670,58 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %ebx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2921,40 +2923,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), 
%edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch @@ -3057,40 +3057,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %eax +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl @@ -3108,26 +3106,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -3135,20 +3133,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3167,103 +3165,104 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 20(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 20(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 28(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 24(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -3277,91 +3276,93 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) 
nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -3535,10 +3536,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx @@ -3553,10 +3554,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq 
%r13, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 @@ -3571,9 +3572,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12 @@ -3621,54 +3622,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r15, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -3715,42 +3716,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r10, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r15, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rdi,%rdi), %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r12, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r13, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r13, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d +; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r12, %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r13, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 @@ -4184,15 +4185,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -4207,13 +4208,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -4228,7 +4229,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) @@ -4276,20 +4277,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -4311,10 +4312,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -4331,146 +4330,146 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: 
shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 76(%esp,%ebx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%ebx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
%esi, 48(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 40(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -4608,19 +4607,19 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) 
nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %ebp, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx @@ -4634,13 +4633,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 48(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4651,12 +4650,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 
(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4723,18 +4722,18 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi ; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r14), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx @@ -4786,15 +4785,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 48(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 56(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 @@ -4942,55 +4941,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r10d -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r14 +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r8d +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %bpl -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r8d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r8d ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r10, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r10, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r10, %r15, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %r11, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r13, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 40(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -5039,45 +5038,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %r8 +; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r8, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %rbx, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %r12, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %r13, %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %rbp, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r14, %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r12, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r11, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r14, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 
32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 @@ -5430,11 +5429,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5451,12 +5450,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx @@ -5466,67 +5465,67 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: 
shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx @@ -5544,18 +5543,18 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 52(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5567,7 +5566,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) @@ -5595,41 +5594,41 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $216, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5642,8 +5641,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5652,12 +5649,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5674,106 +5672,106 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %esi, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %edx ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 212(%esp,%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, (%esp), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 212(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edi @@ -5803,9 +5801,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 36(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 28(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) @@ -5840,13 +5838,14 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax @@ -5856,10 +5855,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx @@ -5870,8 +5868,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5883,13 +5882,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax @@ -5916,64 +5914,64 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill 
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -5984,32 +5982,32 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 44(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) @@ -6017,7 +6015,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -6209,10 +6207,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx @@ -6227,10 +6225,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 @@ -6245,9 +6243,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: 
movq %r9, 40(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12 @@ -6296,54 +6294,54 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, 
%r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r15, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -6391,42 +6389,42 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r10, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r15, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rdi,%rdi), %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r12, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rbx, %r13, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r13, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq 
%r15, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r12, %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r13, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 @@ -6862,15 +6860,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -6885,13 +6883,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload @@ -6906,7 +6904,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) @@ -6954,23 +6952,21 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -7008,149 +7004,148 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: 
addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 76(%esp,%ebx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%ebx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 60(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 40(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -7164,7 +7159,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -7241,119 +7236,117 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 40(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 32(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 16(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 52(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 40(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 32(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 7bd110748d55b..aaba44c8dc111 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1072,109 +1072,109 @@ ret void define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, ptr %p) nounwind { ; AVX1-LABEL: interleaved_store_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rax +; AVX1-NEXT: subq $24, %rsp ; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %ymm4, %ymm5 +; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovdqa %ymm2, %ymm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm12 -; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7 -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm7 -; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm13 -; AVX1-NEXT: vpor %xmm7, %xmm13, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm10, %xmm8, %xmm13 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm14 -; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm15 -; AVX1-NEXT: vpor %xmm13, %xmm15, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX1-NEXT: vpshufb %xmm15, %xmm0, %xmm15 -; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm10 -; AVX1-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-NEXT: vmovdqa %ymm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 +; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7 +; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = 
xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm0 +; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm7 +; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm8 +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm10 -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] -; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm11 -; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm2, %xmm12, %xmm12 -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm12 -; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] -; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] -; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX1-NEXT: # xmm8 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX1-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX1-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX1-NEXT: vpor %xmm14, %xmm10, %xmm10 -; AVX1-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm7, %xmm9 -; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX1-NEXT: vpshufb %xmm9, %xmm12, %xmm12 -; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm7 -; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm2 -; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm15 +; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm10 +; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm10 +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm9 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm8 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm13, %xmm12, %xmm13 +; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm5, %xmm11, %xmm11 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm11, %xmm10, %xmm11 +; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm0, %xmm15, %xmm15 +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = 
xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] +; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX1-NEXT: vpshufb %xmm10, %xmm6, %xmm12 +; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 +; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 +; AVX1-NEXT: vpshufb %xmm10, %xmm14, %xmm14 +; AVX1-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqu %xmm7, 80(%rdi) -; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi) -; AVX1-NEXT: vmovdqu %xmm5, 16(%rdi) +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm10 +; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm6 +; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm11 +; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqu %xmm6, 80(%rdi) +; AVX1-NEXT: vmovdqu %xmm9, 64(%rdi) +; AVX1-NEXT: vmovdqu %xmm8, 16(%rdi) ; AVX1-NEXT: vmovdqu %xmm4, (%rdi) -; AVX1-NEXT: vmovdqu %xmm12, 48(%rdi) -; AVX1-NEXT: vmovdqu %xmm8, 32(%rdi) -; AVX1-NEXT: vmovdqu %xmm6, 176(%rdi) +; AVX1-NEXT: vmovdqu %xmm10, 48(%rdi) +; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi) +; AVX1-NEXT: vmovdqu %xmm2, 176(%rdi) ; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi) -; AVX1-NEXT: vmovdqu %xmm10, 112(%rdi) +; AVX1-NEXT: vmovdqu %xmm12, 112(%rdi) ; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi) ; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, 128(%rdi) -; AVX1-NEXT: popq %rax +; AVX1-NEXT: vmovdqu %xmm5, 128(%rdi) +; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1273,116 +1273,116 @@ ret void define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-LABEL: interleaved_load_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm9 -; AVX1-NEXT: vmovups 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqu (%rdi), %xmm11 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13 +; AVX1-NEXT: vmovups 64(%rdi), %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu 48(%rdi), %xmm10 -; AVX1-NEXT: vmovdqu 64(%rdi), %xmm3 ; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 -; AVX1-NEXT: vmovdqu 96(%rdi), %xmm6 +; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5 ; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2 -; AVX1-NEXT: vmovdqu 144(%rdi), %xmm12 -; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-NEXT: 
vpshufb %xmm11, %xmm6, %xmm5 -; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7 -; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm8 -; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm11 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm15 +; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10 +; AVX1-NEXT: vmovdqu 160(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] +; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6 +; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7 +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8 +; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpor %xmm6, %xmm15, %xmm0 +; AVX1-NEXT: vpor %xmm5, %xmm12, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12 +; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm11 ; AVX1-NEXT: vmovdqa %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm1 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm12 +; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm11 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX1-NEXT: vpor %xmm9, %xmm15, %xmm6 -; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm9 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm10 -; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm10 -; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm9 -; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm6 -; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm9 -; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm3 -; AVX1-NEXT: vpshufb %xmm14, %xmm9, %xmm12 -; AVX1-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-NEXT: vmovdqu 176(%rdi), %xmm12 -; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm14, %xmm12, %xmm15 -; AVX1-NEXT: vpor %xmm1, %xmm15, %xmm1 -; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX1-NEXT: vmovdqu 128(%rdi), %xmm15 -; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm14 -; AVX1-NEXT: vpor %xmm13, %xmm14, %xmm14 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm13 +; AVX1-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm13 +; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm13 +; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm10 +; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX1-NEXT: vmovdqu 176(%rdi), %xmm13 +; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX1-NEXT: 
vpshufb %xmm15, %xmm13, %xmm12 +; AVX1-NEXT: vpor %xmm0, %xmm12, %xmm3 +; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm12 +; AVX1-NEXT: vmovdqu 128(%rdi), %xmm14 +; AVX1-NEXT: vpshufb %xmm15, %xmm14, %xmm15 +; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm15 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm13 -; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm13 -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm14 -; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm14 -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm12 +; AVX1-NEXT: vpor %xmm6, %xmm12, %xmm12 +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm15 +; AVX1-NEXT: vpor %xmm7, %xmm15, %xmm15 +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm11[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX1-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm10 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] ; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm10 -; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10 +; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm10, %xmm1 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm10 -; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX1-NEXT: vpor %xmm12, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10 +; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm11 -; AVX1-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm9 +; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm11 +; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm9 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] ; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm13, %xmm3, 
%xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm10, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm3 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm13, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm9, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm3 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm10, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm3 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleaved_load_vf64_i8_stride3: @@ -1489,64 +1489,64 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX1-LABEL: interleaved_store_vf64_i8_stride4: ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm12 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm11 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm13 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm13 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] 
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX1-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX1-NEXT: 
vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 ; AVX1-NEXT: vmovaps %ymm3, 224(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 192(%rdi) ; AVX1-NEXT: vmovaps %ymm1, 160(%rdi) -; AVX1-NEXT: vmovaps %ymm6, 128(%rdi) +; AVX1-NEXT: vmovaps %ymm4, 128(%rdi) ; AVX1-NEXT: vmovaps %ymm2, 96(%rdi) ; AVX1-NEXT: vmovaps %ymm9, 64(%rdi) -; AVX1-NEXT: vmovaps %ymm4, 32(%rdi) +; AVX1-NEXT: vmovaps %ymm6, 32(%rdi) ; AVX1-NEXT: vmovaps %ymm8, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index 2517530ca28ec..6eb34b4e773e8 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -222,10 +222,10 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %eax, %ebp -; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %eax, %edi @@ -235,9 +235,9 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: addl %edi, %esi ; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: addl %ebp, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ebp, %esi +; WIN32-NEXT: adcl %ecx, %esi ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx @@ -570,14 +570,14 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: imull %ecx, %edi +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl %eax, %ecx @@ -586,36 +586,35 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: addl %edi, %ebx ; WIN32-NEXT: addl %eax, %ebx ; WIN32-NEXT: addl 
%ecx, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl %esi, %edi -; WIN32-NEXT: adcl $0, %ecx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: adcl $0, %ebp +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %edi, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: adcl %ecx, %ebp +; WIN32-NEXT: addl %ecx, %esi +; WIN32-NEXT: adcl %ebp, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: setb %cl -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: addl %edi, %eax ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload @@ -628,9 +627,9 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: jne LBB12_2 ; WIN32-NEXT: # %bb.1: ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: LBB12_2: -; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: movl %ebp, %edx ; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi @@ -1001,42 +1000,42 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %eax, %ebx -; WIN32-NEXT: addl %esi, %ebx +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: addl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: addl %edi, %esi ; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: addl %ebx, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ebx, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, %ecx ; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: adcl $0, %ebp -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: adcl $0, %edi +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: addl %ecx, %ebx -; WIN32-NEXT: adcl %ebp, %edi +; WIN32-NEXT: adcl %edi, %ebp ; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %edi, %eax +; 
WIN32-NEXT: addl %ebp, %eax ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload @@ -1696,23 +1695,23 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $16, %esp +; WIN32-NEXT: subl $20, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %edx -; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl 4(%eax), %esi -; WIN32-NEXT: movl %esi, (%esp) # 4-byte Spill +; WIN32-NEXT: movl (%eax), %ebx +; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl 4(%eax), %ebp ; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: movl %ecx, %edi ; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: imull %esi, %ecx -; WIN32-NEXT: mull %edx -; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: imull %ebp, %ecx +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: addl %ecx, %ebx -; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi @@ -1721,35 +1720,36 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: addl %eax, %edi ; WIN32-NEXT: addl %esi, %edi -; WIN32-NEXT: addl %ebp, %ebx -; WIN32-NEXT: addl %eax, %ebp -; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload +; WIN32-NEXT: addl %ecx, %ebx +; WIN32-NEXT: addl %eax, %ecx +; WIN32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; WIN32-NEXT: adcl %ebx, %edi -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl (%esp), %eax # 4-byte Reload -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %esi ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %ebp, %ecx +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; WIN32-NEXT: adcl $0, %ebx -; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %ecx, %ebp -; WIN32-NEXT: adcl %ebx, %esi -; WIN32-NEXT: setb %cl -; WIN32-NEXT: movl (%esp), %eax # 4-byte Reload +; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: adcl %ebx, %ecx +; WIN32-NEXT: setb %bl +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: movzbl %cl, %ecx +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: movzbl %bl, %ecx ; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: adcl %edi, %edx ; WIN32-NEXT: movl 
%ebp, %ecx ; WIN32-NEXT: sarl $31, %ecx @@ -1761,7 +1761,7 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $16, %esp +; WIN32-NEXT: addl $20, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -1805,44 +1805,44 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $16, %esp +; WIN32-NEXT: subl $12, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %ebp -; WIN32-NEXT: movl 4(%eax), %ebx -; WIN32-NEXT: movl %ebx, (%esp) # 4-byte Spill +; WIN32-NEXT: movl 4(%eax), %eax ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: addl %eax, %ecx ; WIN32-NEXT: addl %esi, %ecx -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: addl %ebx, %esi +; WIN32-NEXT: addl %edi, %esi ; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: addl %ebx, %eax ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: adcl %ecx, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; WIN32-NEXT: addl %ebx, %ecx ; WIN32-NEXT: adcl $0, %edi -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull (%esp) # 4-byte Folded Reload ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %ebp @@ -1866,7 +1866,7 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $16, %esp +; WIN32-NEXT: addl $12, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -2221,30 +2221,29 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %esi +; WIN32-NEXT: movl (%eax), %ebp ; WIN32-NEXT: movl 4(%eax), %eax -; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: testl %esi, %esi ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %cl ; WIN32-NEXT: andb %dl, %cl -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: mull {{[0-9]+}}(%esp) 
; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: seto %ch -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %esi ; WIN32-NEXT: seto %bl -; WIN32-NEXT: orb %ch, %bl -; WIN32-NEXT: orb %cl, %bl -; WIN32-NEXT: leal (%edi,%eax), %ecx ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %ecx, %edx +; WIN32-NEXT: seto %ch +; WIN32-NEXT: orb %bl, %ch +; WIN32-NEXT: orb %cl, %ch +; WIN32-NEXT: leal (%edi,%eax), %esi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: addl %esi, %edx ; WIN32-NEXT: setb %cl -; WIN32-NEXT: orb %bl, %cl +; WIN32-NEXT: orb %ch, %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl %eax, (%esi) ; WIN32-NEXT: movl %edx, 4(%esi) @@ -2300,9 +2299,9 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl (%edx), %ebp -; WIN32-NEXT: movl 4(%edx), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl (%ecx), %ebp +; WIN32-NEXT: movl 4(%ecx), %esi ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %esi, %esi diff --git a/llvm/test/MC/RISCV/fixups-binary-expression.s b/llvm/test/MC/RISCV/fixups-binary-expression.s new file mode 100644 index 0000000000000..dc1de5d6b324a --- /dev/null +++ b/llvm/test/MC/RISCV/fixups-binary-expression.s @@ -0,0 +1,31 @@ +# RUN: llvm-mc -triple riscv32 -mattr=+c -riscv-no-aliases < %s -show-encoding \ +# RUN: | FileCheck -check-prefix=CHECK-FIXUP %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c < %s \ +# RUN: | llvm-objdump -M no-aliases -d - \ +# RUN: | FileCheck -check-prefix=CHECK-INSTR %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+c,+relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck -check-prefix=CHECK-RELOC %s + +.LBB0: + +.LBB1: + +jal zero, .LBB0+16 +# CHECK-FIXUP: fixup A - offset: 0, value: .LBB0+16, kind: fixup_riscv_jal +# CHECK-INSTR: jal zero, 0x10 +# CHECK-RELOC: R_RISCV_JAL + +beq a0, a1, .LBB1+32 +# CHECK-FIXUP: fixup A - offset: 0, value: .LBB1+32, kind: fixup_riscv_branch +# CHECK-INSTR: beq a0, a1, 0x20 +# CHECK-RELOC: R_RISCV_BRANCH + +c.j .+32 +# CHECK-FIXUP: fixup A - offset: 0, value: .Ltmp0+32, kind: fixup_riscv_rvc_jump +# CHECK-INSTR: c.j 0x28 +# CHECK-RELOC: R_RISCV_RVC_JUMP + +c.beqz a0, .-2 +# CHECK-FIXUP: fixup A - offset: 0, value: .Ltmp1-2, kind: fixup_riscv_rvc_branch +# CHECK-INSTR: c.beqz a0, 0x8 +# CHECK-RELOC: R_RISCV_RVC_BRANCH diff --git a/llvm/test/MC/RISCV/fixups-invalid.s b/llvm/test/MC/RISCV/fixups-invalid.s index 44313e0164e36..90b2ce32394ff 100644 --- a/llvm/test/MC/RISCV/fixups-invalid.s +++ b/llvm/test/MC/RISCV/fixups-invalid.s @@ -13,3 +13,5 @@ .equ CONST, .Lbuf_end - .Lbuf # CHECK: error: operand must be a constant 12-bit integer li a0, CONST +# CHECK: error: operand must be a constant 12-bit integer +li a0, .Lbuf_end - .Lbuf diff --git a/llvm/test/MC/RISCV/rv32i-aliases-valid.s b/llvm/test/MC/RISCV/rv32i-aliases-valid.s index 99fcdf05f95bb..5263b0359bc7a 100644 --- a/llvm/test/MC/RISCV/rv32i-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv32i-aliases-valid.s @@ -113,6 +113,11 @@ li a0, CONST # CHECK-OBJ-NOALIAS: addi a0, zero, 8 li a0, CONST +# CHECK-ASM: addi a0, zero, .Lbuf_end-.Lbuf +# CHECK-ASM-NOALIAS: addi a0, zero, .Lbuf_end-.Lbuf +# CHECK-OBJ-NOALIAS: addi a0, zero, 8 +li a0, .Lbuf_end - .Lbuf + # CHECK-INST: addi a0, zero, 0 # CHECK-ALIAS: li a0, 0 la x10, 0 diff --git a/llvm/test/MC/RISCV/rv64i-aliases-valid.s 
b/llvm/test/MC/RISCV/rv64i-aliases-valid.s index 1fc810aaa2bba..a79c938920745 100644 --- a/llvm/test/MC/RISCV/rv64i-aliases-valid.s +++ b/llvm/test/MC/RISCV/rv64i-aliases-valid.s @@ -225,6 +225,11 @@ li a0, CONST # CHECK-OBJ-NOALIAS: addi a0, zero, 8 li a0, CONST +# CHECK-ASM: addi a0, zero, .Lbuf_end-.Lbuf +# CHECK-ASM-NOALIAS: addi a0, zero, .Lbuf_end-.Lbuf +# CHECK-OBJ-NOALIAS: addi a0, zero, 8 +li a0, .Lbuf_end - .Lbuf + # CHECK-INST: addi a0, zero, 0 # CHECK-ALIAS: li a0, 0 la x10, 0 diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll index 1c58b90f77dea..080b3dd75ee9a 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll @@ -47,17 +47,17 @@ define void @test2(ptr %struct, i32 %n) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cbz x0, .LBB1_3 ; CHECK-NEXT: // %bb.1: // %while_cond.preheader -; CHECK-NEXT: mov w9, #40000 // =0x9c40 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: add x9, x0, x9 -; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: mov w8, #40000 // =0x9c40 +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: cmp w9, w1 ; CHECK-NEXT: b.ge .LBB1_3 ; CHECK-NEXT: .LBB1_2: // %while_body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str w8, [x9, #4] -; CHECK-NEXT: add w8, w8, #1 -; CHECK-NEXT: str w8, [x9] -; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: str w9, [x8, #4] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: cmp w9, w1 ; CHECK-NEXT: b.lt .LBB1_2 ; CHECK-NEXT: .LBB1_3: // %while_end ; CHECK-NEXT: ret @@ -86,20 +86,20 @@ define void @test3(ptr %s1, ptr %s2, i1 %cond, i32 %n) { ; CHECK-LABEL: test3: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: tst w2, #0x1 -; CHECK-NEXT: csel x9, x1, x0, ne -; CHECK-NEXT: cbz x9, .LBB2_3 +; CHECK-NEXT: csel x8, x1, x0, ne +; CHECK-NEXT: cbz x8, .LBB2_3 ; CHECK-NEXT: // %bb.1: // %while_cond.preheader ; CHECK-NEXT: mov w10, #40000 // =0x9c40 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: add x9, x9, x10 -; CHECK-NEXT: cmp w8, w3 +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: cmp w9, w3 ; CHECK-NEXT: b.ge .LBB2_3 ; CHECK-NEXT: .LBB2_2: // %while_body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str w8, [x9, #4] -; CHECK-NEXT: add w8, w8, #1 -; CHECK-NEXT: str w8, [x9] -; CHECK-NEXT: cmp w8, w3 +; CHECK-NEXT: str w9, [x8, #4] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: cmp w9, w3 ; CHECK-NEXT: b.lt .LBB2_2 ; CHECK-NEXT: .LBB2_3: // %while_end ; CHECK-NEXT: ret @@ -150,8 +150,8 @@ define void @test4(i32 %n) uwtable personality ptr @__FrameHandler { ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_remember_state ; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: mov w20, wzr -; CHECK-NEXT: mov w21, #40000 // =0x9c40 +; CHECK-NEXT: mov w21, wzr +; CHECK-NEXT: mov w20, #40000 // =0x9c40 ; CHECK-NEXT: .LBB3_1: // %while_cond ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: .Ltmp0: @@ -159,15 +159,15 @@ define void @test4(i32 %n) uwtable personality ptr @__FrameHandler { ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: // %bb.2: // %while_cond_x.split ; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 -; CHECK-NEXT: add x8, x0, x21 -; CHECK-NEXT: cmp w20, w19 +; CHECK-NEXT: add x8, x0, x20 +; CHECK-NEXT: cmp w21, w19 ; CHECK-NEXT: str wzr, [x8] ; CHECK-NEXT: b.ge .LBB3_4 ; CHECK-NEXT: // %bb.3: // %while_body ; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 -; 
CHECK-NEXT: str w20, [x8, #4] -; CHECK-NEXT: add w20, w20, #1 -; CHECK-NEXT: str w20, [x8] +; CHECK-NEXT: str w21, [x8, #4] +; CHECK-NEXT: add w21, w21, #1 +; CHECK-NEXT: str w21, [x8] ; CHECK-NEXT: b .LBB3_1 ; CHECK-NEXT: .LBB3_4: // %while_end ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload @@ -220,18 +220,18 @@ declare i32 @__FrameHandler(...) define void @test5(ptr %s, i32 %n) { ; CHECK-LABEL: test5: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: add x9, x9, #19, lsl #12 // =77824 -; CHECK-NEXT: add x9, x9, #2176 -; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: add x8, x8, #19, lsl #12 // =77824 +; CHECK-NEXT: add x8, x8, #2176 +; CHECK-NEXT: cmp w9, w1 ; CHECK-NEXT: b.ge .LBB4_2 ; CHECK-NEXT: .LBB4_1: // %while_body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str w8, [x9, #4] -; CHECK-NEXT: add w8, w8, #1 -; CHECK-NEXT: str w8, [x9] -; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: str w9, [x8, #4] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: cmp w9, w1 ; CHECK-NEXT: b.lt .LBB4_1 ; CHECK-NEXT: .LBB4_2: // %while_end ; CHECK-NEXT: ret diff --git a/llvm/test/Transforms/FunctionSpecialization/bug55000-read-uninitialized-value.ll b/llvm/test/Transforms/FunctionSpecialization/bug55000-read-uninitialized-value.ll index 0cd1538445434..d96460efe3462 100644 --- a/llvm/test/Transforms/FunctionSpecialization/bug55000-read-uninitialized-value.ll +++ b/llvm/test/Transforms/FunctionSpecialization/bug55000-read-uninitialized-value.ll @@ -4,8 +4,8 @@ declare hidden i1 @compare(ptr) align 2 declare hidden { i8, ptr } @getType(ptr) align 2 ; CHECK-LABEL: @foo -; CHECK-LABEL: @foo.1 -; CHECK-LABEL: @foo.2 +; CHECK-LABEL: @foo.specialized.1 +; CHECK-LABEL: @foo.specialized.2 define internal void @foo(ptr %TLI, ptr %DL, ptr %Ty, ptr %ValueVTs, ptr %Offsets, i64 %StartingOffset) { entry: diff --git a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-58759.ll b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-58759.ll index 5cbfaade98d3c..f29cf0d123939 100644 --- a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-58759.ll +++ b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-58759.ll @@ -23,5 +23,5 @@ declare i32 @p0(i32 noundef) declare i32 @p1(i32 noundef) ;; Tests that `f` has been fully specialized and it didn't cause a compiler crash. -;; CHECK-DAG: f.1 -;; CHECK-DAG: f.2 +;; CHECK-DAG: f.specialized.1 +;; CHECK-DAG: f.specialized.2 diff --git a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-60191.ll b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-60191.ll index 35364c4b0ad2b..668929824cc6f 100644 --- a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-60191.ll +++ b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-60191.ll @@ -60,7 +60,7 @@ define i32 @f2(i32 %offset) { } ; Tests that `func` has been specialized and it didn't cause a compiler crash. 
-; CHECK-DAG: func.1 -; CHECK-DAG: func.2 -; CHECK-DAG: func.3 +; CHECK-DAG: func.specialized.1 +; CHECK-DAG: func.specialized.2 +; CHECK-DAG: func.specialized.3 diff --git a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-promote-alloca.ll b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-promote-alloca.ll index fff454db303c9..38450ba6819d7 100644 --- a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-promote-alloca.ll +++ b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-promote-alloca.ll @@ -2,7 +2,7 @@ ; Tests that `bar` has been specialized and that the compiler did not crash ; while attempting to promote the alloca in `entry`. -; CHECK: bar.1 +; CHECK: bar.specialized.1 @block = internal constant [8 x i8] zeroinitializer, align 1 diff --git a/llvm/test/Transforms/FunctionSpecialization/constant-struct.ll b/llvm/test/Transforms/FunctionSpecialization/constant-struct.ll index 6c3bfaef49b0a..39df6b05aa5a7 100644 --- a/llvm/test/Transforms/FunctionSpecialization/constant-struct.ll +++ b/llvm/test/Transforms/FunctionSpecialization/constant-struct.ll @@ -8,7 +8,7 @@ define i32 @foo(i32 %y0, i32 %y1) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[Y:%.*]] = insertvalue { i32, i32 } undef, i32 [[Y0:%.*]], 0 ; CHECK-NEXT: [[YY:%.*]] = insertvalue { i32, i32 } [[Y]], i32 [[Y1:%.*]], 1 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @add.1({ i32, i32 } { i32 2, i32 3 }, { i32, i32 } [[YY]]) +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @add.specialized.1({ i32, i32 } { i32 2, i32 3 }, { i32, i32 } [[YY]]) ; CHECK-NEXT: ret i32 [[CALL]] ; entry: @@ -23,7 +23,7 @@ define i32 @bar(i32 %x0, i32 %x1) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[X:%.*]] = insertvalue { i32, i32 } undef, i32 [[X0:%.*]], 0 ; CHECK-NEXT: [[XX:%.*]] = insertvalue { i32, i32 } [[X]], i32 [[X1:%.*]], 1 -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @add.2({ i32, i32 } [[XX]], { i32, i32 } { i32 3, i32 2 }) +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @add.specialized.2({ i32, i32 } [[XX]], { i32, i32 } { i32 3, i32 2 }) ; CHECK-NEXT: ret i32 [[CALL]] ; entry: diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll index b5a0084ed52e6..5e65b0db7b235 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll @@ -1,6 +1,6 @@ ; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s -; CHECK-NOT: foo.{{[0-9]+}} +; CHECK-NOT: foo.specialized.{{[0-9]+}} target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll index 57db1cf71a9b9..c242816b91d43 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll @@ -30,10 +30,10 @@ define internal i64 @zoo(i1 %flag) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]] ; CHECK: plus: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @func2.2(ptr getelementptr inbounds ([[STRUCT:%.*]], ptr @Global, i64 0, i32 3)) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @func2.specialized.2(ptr getelementptr inbounds ([[STRUCT:%.*]], ptr 
@Global, i64 0, i32 3))
; CHECK-NEXT: br label [[MERGE:%.*]]
; CHECK: minus:
-; CHECK-NEXT: [[TMP1:%.*]] = call i64 @func2.1(ptr getelementptr inbounds ([[STRUCT]], ptr @Global, i64 0, i32 4))
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @func2.specialized.1(ptr getelementptr inbounds ([[STRUCT]], ptr @Global, i64 0, i32 4))
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ ptrtoint (ptr getelementptr inbounds ([[STRUCT]], ptr @Global, i64 0, i32 3) to i64), [[PLUS]] ], [ ptrtoint (ptr getelementptr inbounds ([[STRUCT]], ptr @Global, i64 0, i32 4) to i64), [[MINUS]] ]
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression3.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression3.ll
index b1b7f1fd820d6..7eaa68064607b 100644
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression3.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression3.ll
@@ -4,8 +4,8 @@ define i32 @main() {
; CHECK-LABEL: @main(
; CHECK-NEXT: bb:
-; CHECK-NEXT: tail call void @wombat.1(ptr undef, i64 undef, i64 undef, ptr @quux)
-; CHECK-NEXT: tail call void @wombat.2(ptr undef, i64 undef, i64 undef, ptr @eggs)
+; CHECK-NEXT: tail call void @wombat.specialized.1(ptr undef, i64 undef, i64 undef, ptr @quux)
+; CHECK-NEXT: tail call void @wombat.specialized.2(ptr undef, i64 undef, i64 undef, ptr @eggs)
; CHECK-NEXT: ret i32 undef
;
bb:
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression4.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression4.ll
index 9cac9ca95f8bc..0410bfd9407b9 100644
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression4.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression4.ll
@@ -2,7 +2,7 @@
; Check that we don't crash and don't specialise on a function call with byval attribute.
-; CHECK-NOT: wombat.{{[0-9]+}}
+; CHECK-NOT: wombat.specialized.{{[0-9]+}}
declare ptr @quux()
declare ptr @eggs()
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression5.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression5.ll
index c53673cf84b63..c1b442be97e62 100644
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression5.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression5.ll
@@ -3,7 +3,7 @@
; Check that we don't crash and don't specialise on a scalar global variable with byval attribute.
-; CHECK-NOT: wombat.{{[0-9]+}}
+; CHECK-NOT: wombat.specialized.{{[0-9]+}}
%struct.pluto = type { %struct.spam }
%struct.quux = type { i16 }
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll
index 976a326a4a886..17f9c30122d10 100644
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll
@@ -1,8 +1,8 @@
; RUN: opt -passes="ipsccp" -funcspec-for-literal-constant=true -force-specialization -S < %s | FileCheck %s
; Check that the literal constant parameter can be specialized.
-; CHECK: @foo.1( -; CHECK: @foo.2( +; CHECK: @foo.specialized.1( +; CHECK: @foo.specialized.2( target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize.ll index 6cc35403cc4e1..9127e90ce23e8 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize.ll @@ -1,7 +1,7 @@ ; RUN: opt -passes="ipsccp" -S < %s | FileCheck %s -; CHECK-NOT: @compute.1 -; CHECK-NOT: @compute.2 +; CHECK-NOT: @compute.specialized.1 +; CHECK-NOT: @compute.specialized.2 define i64 @main(i64 %x, i1 %flag) { entry: diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize2.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize2.ll index 2d0e04d01dc37..0e2c4836b6930 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize2.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize2.ll @@ -3,8 +3,8 @@ ; Checks for callsites that have been annotated with MinSize. No specialisation ; expected here: ; -; CHECK-NOT: @compute.1 -; CHECK-NOT: @compute.2 +; CHECK-NOT: @compute.specialized.1 +; CHECK-NOT: @compute.specialized.2 define i64 @main(i64 %x, i1 %flag) { entry: diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll index 525721f03cfb2..743ca89e96cc7 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll @@ -4,7 +4,7 @@ ; specialisation for the call that does not have the attribute: ; ; CHECK: plus: -; CHECK: %tmp0 = call i64 @compute.1(i64 %x, ptr @plus) +; CHECK: %tmp0 = call i64 @compute.specialized.1(i64 %x, ptr @plus) ; CHECK: br label %merge ; CHECK: minus: ; CHECK: %tmp1 = call i64 @compute(i64 %x, ptr @minus) #0 diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-nodup.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-nodup.ll index d9dcb44dcdb52..a89aa2db84333 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-nodup.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-nodup.ll @@ -3,8 +3,8 @@ ; Function @foo has function attribute 'noduplicate', so check that we don't ; specialize it: -; CHECK-NOT: @foo.1( -; CHECK-NOT: @foo.2( +; CHECK-NOT: @foo.specialized.1( +; CHECK-NOT: @foo.specialized.2( target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-nodup2.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-nodup2.ll index c950dfa31e4b2..a61013e07b73a 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-nodup2.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-nodup2.ll @@ -5,8 +5,8 @@ ; Please note that the use of the hardwareloop intrinsic is arbitrary; it's ; just an easy to use intrinsic that has NoDuplicate. 
-; CHECK-NOT: @foo.1( -; CHECK-NOT: @foo.2( +; CHECK-NOT: @foo.specialized.1( +; CHECK-NOT: @foo.specialized.2( target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-noexec.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-noexec.ll index d1e2a77dfc19c..47624c7b1fe33 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-noexec.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-noexec.ll @@ -2,8 +2,8 @@ ; The if.then block is not executed, so check that we don't specialise here. -; CHECK-NOT: @foo.1( -; CHECK-NOT: @foo.2( +; CHECK-NOT: @foo.specialized.1( +; CHECK-NOT: @foo.specialized.2( target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-nonconst-glob.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-nonconst-glob.ll index 54eed8d1346fe..6c22126121613 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-nonconst-glob.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-nonconst-glob.ll @@ -7,8 +7,8 @@ ; Global B is not constant. We do not specialise on addresses unless we ; enable that: -; ON-ADDRESS: call i32 @foo.1(i32 %x, ptr @A) -; ON-ADDRESS: call i32 @foo.2(i32 %y, ptr @B) +; ON-ADDRESS: call i32 @foo.specialized.1(i32 %x, ptr @A) +; ON-ADDRESS: call i32 @foo.specialized.2(i32 %y, ptr @B) target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -21,7 +21,7 @@ define dso_local i32 @bar(i32 %x, i32 %y) { ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X:%.*]], 0 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo.1(i32 [[X]], ptr @A) +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo.specialized.1(i32 [[X]], ptr @A) ; CHECK-NEXT: br label [[RETURN:%.*]] ; CHECK: if.else: ; CHECK-NEXT: [[CALL1:%.*]] = call i32 @foo(i32 [[Y:%.*]], ptr @B) @@ -60,11 +60,11 @@ entry: ret i32 %add } -; CHECK-LABEL: define internal i32 @foo.1(i32 %x, ptr %b) { +; CHECK-LABEL: define internal i32 @foo.specialized.1(i32 %x, ptr %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: %0 = load i32, ptr @A, align 4 ; CHECK-NEXT: %add = add nsw i32 %x, %0 ; CHECK-NEXT: ret i32 %add ; CHECK-NEXT: } -; CHECK-NOT: define internal i32 @foo.2( +; CHECK-NOT: define internal i32 @foo.specialized.2( diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive2.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive2.ll index 9c7d3b22c917d..68ad5821dbe6e 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive2.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive2.ll @@ -2,8 +2,8 @@ ; Volatile store preventing recursive specialisation: ; -; CHECK: @recursiveFunc.1 -; CHECK-NOT: @recursiveFunc.2 +; CHECK: @recursiveFunc.specialized.1 +; CHECK-NOT: @recursiveFunc.specialized.2 @Global = internal constant i32 1, align 4 diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive3.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive3.ll index 633138721e554..a98c91d2b82a3 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive3.ll +++ 
b/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive3.ll @@ -2,8 +2,8 @@ ; Duplicate store preventing recursive specialisation: ; -; CHECK: @recursiveFunc.1 -; CHECK-NOT: @recursiveFunc.2 +; CHECK: @recursiveFunc.specialized.1 +; CHECK-NOT: @recursiveFunc.specialized.2 @Global = internal constant i32 1, align 4 diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive4.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive4.ll index 6dca04c17bf4d..e9bee3e50088d 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive4.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-recursive4.ll @@ -2,8 +2,8 @@ ; Alloca is not an integer type: ; -; CHECK: @recursiveFunc.1 -; CHECK-NOT: @recursiveFunc.2 +; CHECK: @recursiveFunc.specialized.1 +; CHECK-NOT: @recursiveFunc.specialized.2 @Global = internal constant i32 1, align 4 diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll index b5d16f6dab1c0..e1cacce4a7dde 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll @@ -7,10 +7,10 @@ define i64 @main(i64 %x, i1 %flag) { ; CHECK: entry: ; CHECK-NEXT: br i1 %flag, label %plus, label %minus ; CHECK: plus: -; CHECK-NEXT: [[TMP0:%.+]] = call i64 @compute.1(i64 %x, ptr @plus) +; CHECK-NEXT: [[TMP0:%.+]] = call i64 @compute.specialized.1(i64 %x, ptr @plus) ; CHECK-NEXT: br label %merge ; CHECK: minus: -; CHECK-NEXT: [[TMP1:%.+]] = call i64 @compute.2(i64 %x, ptr @minus) +; CHECK-NEXT: [[TMP1:%.+]] = call i64 @compute.specialized.2(i64 %x, ptr @minus) ; CHECK-NEXT: br label %merge ; CHECK: merge: ; CHECK-NEXT: [[TMP2:%.+]] = phi i64 [ [[TMP0]], %plus ], [ [[TMP1]], %minus ] @@ -18,7 +18,7 @@ define i64 @main(i64 %x, i1 %flag) { ; CHECK-NEXT: } ; ; NOFSPEC-LABEL: @main(i64 %x, i1 %flag) { -; NOFSPEC-NOT: call i64 @compute.{{[0-9]+}}( +; NOFSPEC-NOT: call i64 @compute.specialized.{{[0-9]+}}( ; NOFSPEC: call i64 @compute( ; entry: @@ -39,20 +39,20 @@ merge: ; CHECK-NOT: define internal i64 @compute( ; -; CHECK-LABEL: define internal i64 @compute.1(i64 %x, ptr %binop) { +; CHECK-LABEL: define internal i64 @compute.specialized.1(i64 %x, ptr %binop) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.+]] = call i64 @plus(i64 %x) ; CHECK-NEXT: ret i64 [[TMP0]] ; CHECK-NEXT: } ; -; CHECK-LABEL: define internal i64 @compute.2(i64 %x, ptr %binop) { +; CHECK-LABEL: define internal i64 @compute.specialized.2(i64 %x, ptr %binop) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.+]] = call i64 @minus(i64 %x) ; CHECK-NEXT: ret i64 [[TMP0]] ; CHECK-NEXT: } ; ; NOFSPEC: define internal i64 @compute( -; NOFSPEC-NOT: define internal i64 @compute.{{[0-9]+}}( +; NOFSPEC-NOT: define internal i64 @compute.specialized.{{[0-9]+}}( ; define internal i64 @compute(i64 %x, ptr %binop) { entry: diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll index 950ed13f7b9e1..b6cdcf18eea42 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll @@ -3,8 +3,8 @@ ; RUN: opt -passes="ipsccp,deadargelim" -funcspec-max-iters=1 -force-specialization -S < %s | FileCheck %s ; RUN: opt 
-passes="ipsccp,deadargelim" -funcspec-max-iters=0 -force-specialization -S < %s | FileCheck %s --check-prefix=DISABLED -; DISABLED-NOT: @func.1( -; DISABLED-NOT: @func.2( +; DISABLED-NOT: @func.specialized.1( +; DISABLED-NOT: @func.specialized.2( define internal i32 @func(ptr %0, i32 %1, ptr nocapture %2) { %4 = alloca i32, align 4 @@ -42,15 +42,15 @@ define internal void @decrement(ptr nocapture %0) { } define i32 @main(ptr %0, i32 %1) { -; CHECK: call void @func.2(ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) +; CHECK: call void @func.specialized.2(ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) %3 = call i32 @func(ptr %0, i32 %1, ptr nonnull @increment) -; CHECK: call void @func.1(ptr [[TMP0]], i32 0) +; CHECK: call void @func.specialized.1(ptr [[TMP0]], i32 0) %4 = call i32 @func(ptr %0, i32 %3, ptr nonnull @decrement) ; CHECK: ret i32 0 ret i32 %4 } -; CHECK: @func.1( +; CHECK: @func.specialized.1( ; CHECK: [[TMP3:%.*]] = alloca i32, align 4 ; CHECK: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 ; CHECK: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -63,13 +63,13 @@ define i32 @main(ptr %0, i32 %1) { ; CHECK: call void @decrement(ptr [[TMP9]]) ; CHECK: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 -; CHECK: call void @func.1(ptr [[TMP0]], i32 [[TMP11]]) +; CHECK: call void @func.specialized.1(ptr [[TMP0]], i32 [[TMP11]]) ; CHECK: br label [[TMP12:%.*]] ; CHECK: 12: ; CHECK: ret void ; ; -; CHECK: @func.2( +; CHECK: @func.specialized.2( ; CHECK: [[TMP3:%.*]] = alloca i32, align 4 ; CHECK: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 ; CHECK: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 @@ -82,7 +82,7 @@ define i32 @main(ptr %0, i32 %1) { ; CHECK: call void @increment(ptr [[TMP9]]) ; CHECK: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 ; CHECK: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 -; CHECK: call void @func.2(ptr [[TMP0]], i32 [[TMP11]]) +; CHECK: call void @func.specialized.2(ptr [[TMP0]], i32 [[TMP11]]) ; CHECK: br label [[TMP12:%.*]] ; CHECK: 12: ; CHECK: ret void diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll index d80b6dfcf18aa..8e075edaa6844 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll @@ -12,9 +12,9 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define dso_local i32 @bar(i32 %x, i32 %y) { ; COMMON-LABEL: @bar -; FORCE: %call = call i32 @foo.1(i32 %x, ptr @A) -; FORCE: %call1 = call i32 @foo.2(i32 %y, ptr @B) -; DISABLED-NOT: %call1 = call i32 @foo.1( +; FORCE: %call = call i32 @foo.specialized.1(i32 %x, ptr @A) +; FORCE: %call1 = call i32 @foo.specialized.2(i32 %y, ptr @B) +; DISABLED-NOT: %call1 = call i32 @foo.specialized.1( entry: %tobool = icmp ne i32 %x, 0 br i1 %tobool, label %if.then, label %if.else @@ -34,14 +34,14 @@ return: ; FORCE-NOT: define internal i32 @foo( ; -; FORCE: define internal i32 @foo.1(i32 %x, ptr %b) { +; FORCE: define internal i32 @foo.specialized.1(i32 %x, ptr %b) { ; FORCE-NEXT: entry: ; FORCE-NEXT: %0 = load i32, ptr @A, align 4 ; FORCE-NEXT: %add = add nsw i32 %x, %0 ; FORCE-NEXT: ret i32 %add ; FORCE-NEXT: } ; -; FORCE: define internal i32 @foo.2(i32 %x, ptr %b) { +; FORCE: define internal i32 @foo.specialized.2(i32 %x, ptr %b) { ; FORCE-NEXT: entry: ; FORCE-NEXT: %0 = load i32, ptr @B, align 4 ; FORCE-NEXT: %add = add nsw i32 %x, %0 diff --git 
a/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll index 0e859a0a24feb..4e5a196d66829 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization4.ll @@ -39,10 +39,10 @@ entry: ret i32 %add1 } -; CONST1: define internal i32 @foo.1(i32 %x, ptr %b, ptr %c) -; CONST1-NOT: define internal i32 @foo.2(i32 %x, ptr %b, ptr %c) +; CONST1: define internal i32 @foo.specialized.1(i32 %x, ptr %b, ptr %c) +; CONST1-NOT: define internal i32 @foo.specialized.2(i32 %x, ptr %b, ptr %c) -; CHECK: define internal i32 @foo.1(i32 %x, ptr %b, ptr %c) { +; CHECK: define internal i32 @foo.specialized.1(i32 %x, ptr %b, ptr %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: %0 = load i32, ptr @A, align 4 ; CHECK-NEXT: %add = add nsw i32 %x, %0 @@ -51,7 +51,7 @@ entry: ; CHECK-NEXT: ret i32 %add1 ; CHECK-NEXT: } -; CHECK: define internal i32 @foo.2(i32 %x, ptr %b, ptr %c) { +; CHECK: define internal i32 @foo.specialized.2(i32 %x, ptr %b, ptr %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: %0 = load i32, ptr @B, align 4 ; CHECK-NEXT: %add = add nsw i32 %x, %0 diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization5.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization5.ll index b272510b3939f..aff1b770f8c81 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization5.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization5.ll @@ -2,8 +2,8 @@ ; There's nothing to specialize here as both calls are the same, so check that: ; -; CHECK-NOT: define internal i32 @foo.1( -; CHECK-NOT: define internal i32 @foo.2( +; CHECK-NOT: define internal i32 @foo.specialized.1( +; CHECK-NOT: define internal i32 @foo.specialized.2( target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/FunctionSpecialization/get-possible-constants.ll b/llvm/test/Transforms/FunctionSpecialization/get-possible-constants.ll index 9b14db5399f3d..dfa1e5a42776a 100644 --- a/llvm/test/Transforms/FunctionSpecialization/get-possible-constants.ll +++ b/llvm/test/Transforms/FunctionSpecialization/get-possible-constants.ll @@ -12,7 +12,7 @@ entry: } ; CHECK-LABEL: define dso_local i32 @f0 -; CHECK: tail call fastcc i32 @g.[[#A:]]({{.*}}@p0) +; CHECK: tail call fastcc i32 @g.specialized.[[#A:]]({{.*}}@p0) ; define dso_local i32 @f0(i32 noundef %x) { entry: @@ -21,7 +21,7 @@ entry: } ; CHECK-LABEL: define dso_local i32 @f1 -; CHECK: tail call fastcc i32 @g.[[#B:]]({{.*}}@p1) +; CHECK: tail call fastcc i32 @g.specialized.[[#B:]]({{.*}}@p1) ; define dso_local i32 @f1(i32 noundef %x) { entry: @@ -40,7 +40,7 @@ entry: } ; CHECK-LABEL: define dso_local i32 @g0 -; CHECK: tail call fastcc i32 @f.[[#C:]]({{.*}}@p0) +; CHECK: tail call fastcc i32 @f.specialized.[[#C:]]({{.*}}@p0) ; define dso_local i32 @g0(i32 noundef %x) { entry: @@ -56,7 +56,7 @@ entry: } ; CHECK-LABEL: define dso_local i32 @g1 -; CHECK: tail call fastcc i32 @f.[[#D:]]({{.*}}@p1) +; CHECK: tail call fastcc i32 @f.specialized.[[#D:]]({{.*}}@p1) ; define dso_local i32 @g1(i32 noundef %x) { entry: @@ -76,7 +76,7 @@ entry: ; Also check that for callsites which reside in the body of newly created ; (specialized) functions, the lattice value of the arguments is known. 
;
-; CHECK-DAG: define internal fastcc i32 @g.[[#A]]
-; CHECK-DAG: define internal fastcc i32 @g.[[#B]]
-; CHECK-DAG: define internal fastcc i32 @f.[[#C]]
-; CHECK-DAG: define internal fastcc i32 @f.[[#D]]
+; CHECK-DAG: define internal fastcc i32 @g.specialized.[[#A]]
+; CHECK-DAG: define internal fastcc i32 @g.specialized.[[#B]]
+; CHECK-DAG: define internal fastcc i32 @f.specialized.[[#C]]
+; CHECK-DAG: define internal fastcc i32 @f.specialized.[[#D]]
diff --git a/llvm/test/Transforms/FunctionSpecialization/global-rank.ll b/llvm/test/Transforms/FunctionSpecialization/global-rank.ll
index 541faa2e19515..1926e29ddee01 100644
--- a/llvm/test/Transforms/FunctionSpecialization/global-rank.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/global-rank.ll
@@ -47,5 +47,5 @@ declare i32 @qq(i32 noundef)
; `f` to be chosen, whereas the old algorithm would choose
; one specialisation of `f` and one of `g`.
-; CHECK-DAG: define internal i32 @f.1
-; CHECK-DAG: define internal i32 @f.2
+; CHECK-DAG: define internal i32 @f.specialized.1
+; CHECK-DAG: define internal i32 @f.specialized.2
diff --git a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll
index 14c4855b58a24..b9481baae60b9 100644
--- a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll
@@ -65,14 +65,14 @@ entry:
; is allowed, then it is performed where possible.
; GLOBALS-LABEL: define internal i32 @g()
-; GLOBALS: call i32 @f.2()
+; GLOBALS: call i32 @f.specialized.2()
; GLOBALS-LABEL: define i32 @h0(ptr %p)
; GLOBALS: call i32 @g()
; GLOBALS-LABEL: define i32 @h1()
-; GLOBALS: call i32 @f.2()
+; GLOBALS: call i32 @f.specialized.2()
; GLOBALS-LABEL: define i32 @h2()
-; GLOBALS: call i32 @f.1()
+; GLOBALS: call i32 @f.specialized.1()
diff --git a/llvm/test/Transforms/FunctionSpecialization/identical-specializations.ll b/llvm/test/Transforms/FunctionSpecialization/identical-specializations.ll
index c2ba0920c2be3..368f3b0440344 100644
--- a/llvm/test/Transforms/FunctionSpecialization/identical-specializations.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/identical-specializations.ll
@@ -6,14 +6,14 @@ define i64 @main(i64 %x, i64 %y, i1 %flag) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
; CHECK: plus:
-; CHECK-NEXT: [[CMP0:%.*]] = call i64 @compute.2(i64 [[X:%.*]], i64 [[Y:%.*]], ptr @plus, ptr @minus)
+; CHECK-NEXT: [[CMP0:%.*]] = call i64 @compute.specialized.2(i64 [[X:%.*]], i64 [[Y:%.*]], ptr @plus, ptr @minus)
; CHECK-NEXT: br label [[MERGE:%.*]]
; CHECK: minus:
-; CHECK-NEXT: [[CMP1:%.*]] = call i64 @compute.3(i64 [[X]], i64 [[Y]], ptr @minus, ptr @plus)
+; CHECK-NEXT: [[CMP1:%.*]] = call i64 @compute.specialized.3(i64 [[X]], i64 [[Y]], ptr @minus, ptr @plus)
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: [[PH:%.*]] = phi i64 [ [[CMP0]], [[PLUS]] ], [ [[CMP1]], [[MINUS]] ]
-; CHECK-NEXT: [[CMP2:%.*]] = call i64 @compute.2(i64 [[PH]], i64 42, ptr @plus, ptr @minus)
+; CHECK-NEXT: [[CMP2:%.*]] = call i64 @compute.specialized.2(i64 [[PH]], i64 42, ptr @plus, ptr @minus)
; CHECK-NEXT: ret i64 [[CMP2]]
;
entry:
@@ -60,20 +60,20 @@ entry:
ret i64 %sub
}
-; CHECK-LABEL: @compute.1
+; CHECK-LABEL: @compute.specialized.1
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP0:%.*]] = call i64 %binop1(i64 [[X:%.*]], i64 [[Y:%.*]])
; CHECK-NEXT: [[CMP1:%.*]] = call i64 @plus(i64 [[X]], i64 [[Y]])
-; CHECK-NEXT: 
[[CMP2:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], ptr %binop1, ptr @plus) +; CHECK-NEXT: [[CMP2:%.*]] = call i64 @compute.specialized.1(i64 [[X]], i64 [[Y]], ptr %binop1, ptr @plus) -; CHECK-LABEL: @compute.2 +; CHECK-LABEL: @compute.specialized.2 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP0:%.*]] = call i64 @plus(i64 [[X:%.*]], i64 [[Y:%.*]]) ; CHECK-NEXT: [[CMP1:%.*]] = call i64 @minus(i64 [[X]], i64 [[Y]]) -; CHECK-NEXT: [[CMP2:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], ptr @plus, ptr @plus) +; CHECK-NEXT: [[CMP2:%.*]] = call i64 @compute.specialized.1(i64 [[X]], i64 [[Y]], ptr @plus, ptr @plus) -; CHECK-LABEL: @compute.3 +; CHECK-LABEL: @compute.specialized.3 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP0:%.*]] = call i64 @minus(i64 [[X:%.*]], i64 [[Y:%.*]]) ; CHECK-NEXT: [[CMP1:%.*]] = call i64 @plus(i64 [[X]], i64 [[Y]]) -; CHECK-NEXT: [[CMP2:%.*]] = call i64 @compute.3(i64 [[X]], i64 [[Y]], ptr @minus, ptr @plus) +; CHECK-NEXT: [[CMP2:%.*]] = call i64 @compute.specialized.3(i64 [[X]], i64 [[Y]], ptr @minus, ptr @plus) diff --git a/llvm/test/Transforms/FunctionSpecialization/literal-const.ll b/llvm/test/Transforms/FunctionSpecialization/literal-const.ll index fc400202ab91e..f107ffe0ec7eb 100644 --- a/llvm/test/Transforms/FunctionSpecialization/literal-const.ll +++ b/llvm/test/Transforms/FunctionSpecialization/literal-const.ll @@ -66,27 +66,27 @@ entry: ; CHECK-NOLIT-NOT: @addf. ; CHECK-LIT-LABEL: define i32 @f0 -; CHECK-LIT: call i32 @neg.[[#A:]] +; CHECK-LIT: call i32 @neg.specialized.[[#A:]] ; CHECK-LIT-LABEL: define i32 @f1 -; CHECK-LIT: call i32 @neg.[[#B:]] +; CHECK-LIT: call i32 @neg.specialized.[[#B:]] ; CHECK-LIT-LABEL: define i32 @g0 -; CHECK-LIT: call i32 @add.[[#C:]] +; CHECK-LIT: call i32 @add.specialized.[[#C:]] ; CHECK-LIT-LABEL: define i32 @g1 -; CHECK-LIT: call i32 @add.[[#D:]] +; CHECK-LIT: call i32 @add.specialized.[[#D:]] ; CHECK-LIT-LABEL: define float @h0 -; CHECK-LIT: call float @addf.[[#E:]] +; CHECK-LIT: call float @addf.specialized.[[#E:]] ; CHECK-LIT-LABEL: define float @h1 -; CHECK-LIT: call float @addf.[[#F:]] +; CHECK-LIT: call float @addf.specialized.[[#F:]] ; Check all of `neg`, `add`, and `addf` were specialised. -; CHECK-LIT-DAG: @neg.[[#A]] -; CHECK-LIT-DAG: @neg.[[#B]] -; CHECK-LIT-DAG: @add.[[#C]] -; CHECK-LIT-DAG: @add.[[#D]] -; CHECK-LIT-DAG: @addf.[[#E]] -; CHECK-LIT-DAG: @addf.[[#F]] +; CHECK-LIT-DAG: @neg.specialized.[[#A]] +; CHECK-LIT-DAG: @neg.specialized.[[#B]] +; CHECK-LIT-DAG: @add.specialized.[[#C]] +; CHECK-LIT-DAG: @add.specialized.[[#D]] +; CHECK-LIT-DAG: @addf.specialized.[[#E]] +; CHECK-LIT-DAG: @addf.specialized.[[#F]] diff --git a/llvm/test/Transforms/FunctionSpecialization/no-spec-unused-arg.ll b/llvm/test/Transforms/FunctionSpecialization/no-spec-unused-arg.ll index 38345652bfc8c..8469727e974fe 100644 --- a/llvm/test/Transforms/FunctionSpecialization/no-spec-unused-arg.ll +++ b/llvm/test/Transforms/FunctionSpecialization/no-spec-unused-arg.ll @@ -17,4 +17,4 @@ define i32 @g1() { ; to be a constant without the need for function specialisation and ; the second parameter is unused. -; CHECK-NOT: @f. +; CHECK-NOT: @f.specialized. diff --git a/llvm/test/Transforms/FunctionSpecialization/noinline.ll b/llvm/test/Transforms/FunctionSpecialization/noinline.ll index 863e6e74eb23c..73576402b0029 100644 --- a/llvm/test/Transforms/FunctionSpecialization/noinline.ll +++ b/llvm/test/Transforms/FunctionSpecialization/noinline.ll @@ -31,5 +31,5 @@ entry: } ; Check that a noinline function is specialized, even if it's small. 
-; CHECK: @f.1
-; CHECK: @f.2
+; CHECK: @f.specialized.1
+; CHECK: @f.specialized.2
diff --git a/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll b/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll
index 14a6fd746d09e..9446e557da758 100644
--- a/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/non-argument-tracked.ll
@@ -29,9 +29,9 @@ define internal i32 @f2(i32 %i) {
;; All calls are to specialisation instances.
; CHECK-LABEL: define i32 @g0
-; CHECK: call void @f0.[[#A:]]()
-; CHECK-NEXT: call void @f1.[[#B:]]()
-; CHECK-NEXT: call void @f2.[[#C:]]()
+; CHECK: call void @f0.specialized.[[#A:]]()
+; CHECK-NEXT: call void @f1.specialized.[[#B:]]()
+; CHECK-NEXT: call void @f2.specialized.[[#C:]]()
; CHECK-NEXT: ret i32 9
define i32 @g0(i32 %i) {
%u0 = call i32 @f0(i32 1)
@@ -43,9 +43,9 @@ define i32 @g0(i32 %i) {
}
; CHECK-LABEL: define i32 @g1
-; CHECK: call void @f0.[[#D:]]()
-; CHECK-NEXT: call void @f1.[[#E:]]()
-; CHECK-NEXT: call void @f2.[[#F:]]()
+; CHECK: call void @f0.specialized.[[#D:]]()
+; CHECK-NEXT: call void @f1.specialized.[[#E:]]()
+; CHECK-NEXT: call void @f2.specialized.[[#F:]]()
; CHECK-NEXT: ret i32 12
define i32 @g1(i32 %i) {
%u0 = call i32 @f0(i32 2)
@@ -58,9 +58,9 @@ define i32 @g1(i32 %i) {
; All of the functions are specialized and all clones have internal linkage.
-; CHECK-DAG: define internal void @f0.[[#A]]() {
-; CHECK-DAG: define internal void @f1.[[#B]]() {
-; CHECK-DAG: define internal void @f2.[[#C]]() {
-; CHECK-DAG: define internal void @f0.[[#D]]() {
-; CHECK-DAG: define internal void @f1.[[#E]]() {
-; CHECK-DAG: define internal void @f2.[[#F]]() {
+; CHECK-DAG: define internal void @f0.specialized.[[#A]]() {
+; CHECK-DAG: define internal void @f1.specialized.[[#B]]() {
+; CHECK-DAG: define internal void @f2.specialized.[[#C]]() {
+; CHECK-DAG: define internal void @f0.specialized.[[#D]]() {
+; CHECK-DAG: define internal void @f1.specialized.[[#E]]() {
+; CHECK-DAG: define internal void @f2.specialized.[[#F]]() {
diff --git a/llvm/test/Transforms/FunctionSpecialization/promoteContantStackValues.ll b/llvm/test/Transforms/FunctionSpecialization/promoteContantStackValues.ll
index 256cbebae0620..e75e0476957eb 100644
--- a/llvm/test/Transforms/FunctionSpecialization/promoteContantStackValues.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/promoteContantStackValues.ll
@@ -27,53 +27,53 @@
ret.block:
ret void
}
-; ITERS1: @funcspec.arg = internal constant i32 0
-; ITERS1: @funcspec.arg.1 = internal constant i32 6
-; ITERS1: @funcspec.arg.3 = internal constant i32 1
-; ITERS1: @funcspec.arg.4 = internal constant i32 5
+; ITERS1: @specialized.arg.1 = internal constant i32 0
+; ITERS1: @specialized.arg.2 = internal constant i32 6
+; ITERS1: @specialized.arg.3 = internal constant i32 1
+; ITERS1: @specialized.arg.4 = internal constant i32 5
-; ITERS2: @funcspec.arg = internal constant i32 0
-; ITERS2: @funcspec.arg.1 = internal constant i32 6
-; ITERS2: @funcspec.arg.3 = internal constant i32 1
-; ITERS2: @funcspec.arg.4 = internal constant i32 5
-; ITERS2: @funcspec.arg.6 = internal constant i32 2
-; ITERS2: @funcspec.arg.7 = internal constant i32 4
+; ITERS2: @specialized.arg.1 = internal constant i32 0
+; ITERS2: @specialized.arg.2 = internal constant i32 6
+; ITERS2: @specialized.arg.3 = internal constant i32 1
+; ITERS2: @specialized.arg.4 = internal constant i32 5
+; ITERS2: @specialized.arg.5 = internal constant i32 2
+; ITERS2: 
@specialized.arg.6 = internal constant i32 4 -; ITERS3: @funcspec.arg = internal constant i32 0 -; ITERS3: @funcspec.arg.1 = internal constant i32 6 -; ITERS3: @funcspec.arg.3 = internal constant i32 1 -; ITERS3: @funcspec.arg.4 = internal constant i32 5 -; ITERS3: @funcspec.arg.6 = internal constant i32 2 -; ITERS3: @funcspec.arg.7 = internal constant i32 4 -; ITERS3: @funcspec.arg.9 = internal constant i32 3 -; ITERS3: @funcspec.arg.10 = internal constant i32 3 +; ITERS3: @specialized.arg.1 = internal constant i32 0 +; ITERS3: @specialized.arg.2 = internal constant i32 6 +; ITERS3: @specialized.arg.3 = internal constant i32 1 +; ITERS3: @specialized.arg.4 = internal constant i32 5 +; ITERS3: @specialized.arg.5 = internal constant i32 2 +; ITERS3: @specialized.arg.6 = internal constant i32 4 +; ITERS3: @specialized.arg.7 = internal constant i32 3 +; ITERS3: @specialized.arg.8 = internal constant i32 3 -; ITERS4: @funcspec.arg = internal constant i32 0 -; ITERS4: @funcspec.arg.1 = internal constant i32 6 -; ITERS4: @funcspec.arg.3 = internal constant i32 1 -; ITERS4: @funcspec.arg.4 = internal constant i32 5 -; ITERS4: @funcspec.arg.6 = internal constant i32 2 -; ITERS4: @funcspec.arg.7 = internal constant i32 4 -; ITERS4: @funcspec.arg.9 = internal constant i32 3 -; ITERS4: @funcspec.arg.10 = internal constant i32 3 +; ITERS4: @specialized.arg.1 = internal constant i32 0 +; ITERS4: @specialized.arg.2 = internal constant i32 6 +; ITERS4: @specialized.arg.3 = internal constant i32 1 +; ITERS4: @specialized.arg.4 = internal constant i32 5 +; ITERS4: @specialized.arg.5 = internal constant i32 2 +; ITERS4: @specialized.arg.6 = internal constant i32 4 +; ITERS4: @specialized.arg.7 = internal constant i32 3 +; ITERS4: @specialized.arg.8 = internal constant i32 3 define i32 @main() { ; ITERS1-LABEL: @main( ; ITERS1-NEXT: call void @print_val(i32 0, i32 6) -; ITERS1-NEXT: call void @recursiveFunc(ptr nonnull @funcspec.arg.3, i32 1, ptr nonnull @funcspec.arg.4) +; ITERS1-NEXT: call void @recursiveFunc(ptr nonnull @specialized.arg.3, i32 1, ptr nonnull @specialized.arg.4) ; ITERS1-NEXT: ret i32 0 ; ; ITERS2-LABEL: @main( ; ITERS2-NEXT: call void @print_val(i32 0, i32 6) ; ITERS2-NEXT: call void @print_val(i32 1, i32 5) -; ITERS2-NEXT: call void @recursiveFunc(ptr nonnull @funcspec.arg.6, i32 1, ptr nonnull @funcspec.arg.7) +; ITERS2-NEXT: call void @recursiveFunc(ptr nonnull @specialized.arg.5, i32 1, ptr nonnull @specialized.arg.6) ; ITERS2-NEXT: ret i32 0 ; ; ITERS3-LABEL: @main( ; ITERS3-NEXT: call void @print_val(i32 0, i32 6) ; ITERS3-NEXT: call void @print_val(i32 1, i32 5) ; ITERS3-NEXT: call void @print_val(i32 2, i32 4) -; ITERS3-NEXT: call void @recursiveFunc(ptr nonnull @funcspec.arg.9, i32 1, ptr nonnull @funcspec.arg.10) +; ITERS3-NEXT: call void @recursiveFunc(ptr nonnull @specialized.arg.7, i32 1, ptr nonnull @specialized.arg.8) ; ITERS3-NEXT: ret i32 0 ; ; ITERS4-LABEL: @main( diff --git a/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll b/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll index 4233998ad9f6d..810526532c106 100644 --- a/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll +++ b/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll @@ -19,14 +19,14 @@ merge: ; CHECK-NOT: define internal i64 @compute( ; -; CHECK-LABEL: define internal i64 @compute.1(i64 %n, ptr %binop) { +; CHECK-LABEL: define internal i64 @compute.specialized.1(i64 %n, ptr %binop) { ; CHECK: 
[[TMP0:%.+]] = call i64 @plus(i64 %n) -; CHECK: [[TMP1:%.+]] = call i64 @compute.1(i64 [[TMP2:%.+]], ptr @plus) +; CHECK: [[TMP1:%.+]] = call i64 @compute.specialized.1(i64 [[TMP2:%.+]], ptr @plus) ; CHECK: add nsw i64 [[TMP1]], [[TMP0]] ; -; CHECK-LABEL: define internal i64 @compute.2(i64 %n, ptr %binop) { +; CHECK-LABEL: define internal i64 @compute.specialized.2(i64 %n, ptr %binop) { ; CHECK: [[TMP0:%.+]] = call i64 @minus(i64 %n) -; CHECK: [[TMP1:%.+]] = call i64 @compute.2(i64 [[TMP2:%.+]], ptr @minus) +; CHECK: [[TMP1:%.+]] = call i64 @compute.specialized.2(i64 [[TMP2:%.+]], ptr @minus) ; CHECK: add nsw i64 [[TMP1]], [[TMP0]] ; define internal i64 @compute(i64 %n, ptr %binop) { diff --git a/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll b/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll index e0afb3bef1039..da4cb40fb6dc5 100644 --- a/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll +++ b/llvm/test/Transforms/FunctionSpecialization/specialization-order.ll @@ -21,7 +21,7 @@ entry: define dso_local i32 @g0(i32 %x, i32 %y) { ; CHECK-LABEL: @g0 -; CHECK: call i32 @f.3(i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK: call i32 @f.specialized.3(i32 [[X:%.*]], i32 [[Y:%.*]]) entry: %call = tail call i32 @f(i32 %x, i32 %y, ptr @add, ptr @add) ret i32 %call @@ -30,7 +30,7 @@ entry: define dso_local i32 @g1(i32 %x, i32 %y) { ; CHECK-LABEL: @g1( -; CHECK: call i32 @f.2(i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK: call i32 @f.specialized.2(i32 [[X:%.*]], i32 [[Y:%.*]]) entry: %call = tail call i32 @f(i32 %x, i32 %y, ptr @sub, ptr @add) ret i32 %call @@ -38,21 +38,21 @@ entry: define dso_local i32 @g2(i32 %x, i32 %y, ptr %v) { ; CHECK-LABEL: @g2 -; CHECK: call i32 @f.1(i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[V:%.*]]) +; CHECK: call i32 @f.specialized.1(i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[V:%.*]]) entry: %call = tail call i32 @f(i32 %x, i32 %y, ptr @sub, ptr %v) ret i32 %call } -; CHECK-LABEL: define {{.*}} i32 @f.1 +; CHECK-LABEL: define {{.*}} i32 @f.specialized.1 ; CHECK: call i32 @sub(i32 %x, i32 %y) ; CHECK-NEXT: call i32 %v(i32 %x, i32 %y) -; CHECK-LABEL: define {{.*}} i32 @f.2 +; CHECK-LABEL: define {{.*}} i32 @f.specialized.2 ; CHECK: call i32 @sub(i32 %x, i32 %y) ; CHECK-NEXT: call i32 @add(i32 %x, i32 %y) -; CHECK-LABEL: define {{.*}} i32 @f.3 +; CHECK-LABEL: define {{.*}} i32 @f.specialized.3 ; CHECK: call i32 @add(i32 %x, i32 %y) ; CHECK-NEXT: call i32 @add(i32 %x, i32 %y) diff --git a/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll b/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll index d1c23e07d5972..a653760abb2cc 100644 --- a/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll +++ b/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll @@ -38,7 +38,7 @@ define i64 @main(i64 %x, i64 %y, i1 %flag) { ; ONE-NEXT: [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], ptr @power, ptr @mul) ; ONE-NEXT: br label [[MERGE:%.*]] ; ONE: minus: -; ONE-NEXT: [[TMP1:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], ptr @plus, ptr @minus) +; ONE-NEXT: [[TMP1:%.*]] = call i64 @compute.specialized.1(i64 [[X]], i64 [[Y]], ptr @plus, ptr @minus) ; ONE-NEXT: br label [[MERGE]] ; ONE: merge: ; ONE-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ] @@ -52,25 +52,25 @@ define i64 @main(i64 %x, i64 %y, i1 %flag) { ; TWO-NEXT: [[TMP0:%.*]] = call i64 @compute(i64 [[X:%.*]], i64 [[Y:%.*]], ptr @power, ptr @mul) ; TWO-NEXT: br label 
[[MERGE:%.*]] ; TWO: minus: -; TWO-NEXT: [[TMP1:%.*]] = call i64 @compute.2(i64 [[X]], i64 [[Y]], ptr @plus, ptr @minus) +; TWO-NEXT: [[TMP1:%.*]] = call i64 @compute.specialized.2(i64 [[X]], i64 [[Y]], ptr @plus, ptr @minus) ; TWO-NEXT: br label [[MERGE]] ; TWO: merge: ; TWO-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ] -; TWO-NEXT: [[TMP3:%.*]] = call i64 @compute.1(i64 [[TMP2]], i64 42, ptr @minus, ptr @power) +; TWO-NEXT: [[TMP3:%.*]] = call i64 @compute.specialized.1(i64 [[TMP2]], i64 42, ptr @minus, ptr @power) ; TWO-NEXT: ret i64 [[TMP3]] ; ; THREE-LABEL: @main( ; THREE-NEXT: entry: ; THREE-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]] ; THREE: plus: -; THREE-NEXT: [[TMP0:%.*]] = call i64 @compute.1(i64 [[X:%.*]], i64 [[Y:%.*]], ptr @power, ptr @mul) +; THREE-NEXT: [[TMP0:%.*]] = call i64 @compute.specialized.1(i64 [[X:%.*]], i64 [[Y:%.*]], ptr @power, ptr @mul) ; THREE-NEXT: br label [[MERGE:%.*]] ; THREE: minus: -; THREE-NEXT: [[TMP1:%.*]] = call i64 @compute.2(i64 [[X]], i64 [[Y]], ptr @plus, ptr @minus) +; THREE-NEXT: [[TMP1:%.*]] = call i64 @compute.specialized.2(i64 [[X]], i64 [[Y]], ptr @plus, ptr @minus) ; THREE-NEXT: br label [[MERGE]] ; THREE: merge: ; THREE-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP0]], [[PLUS]] ], [ [[TMP1]], [[MINUS]] ] -; THREE-NEXT: [[TMP3:%.*]] = call i64 @compute.3(i64 [[TMP2]], i64 42, ptr @minus, ptr @power) +; THREE-NEXT: [[TMP3:%.*]] = call i64 @compute.specialized.3(i64 [[TMP2]], i64 42, ptr @minus, ptr @power) ; THREE-NEXT: ret i64 [[TMP3]] ; entry: @@ -92,7 +92,7 @@ merge: ; THREE-NOT: define internal i64 @compute ; -; THREE-LABEL: define internal i64 @compute.1(i64 %x, i64 %y, ptr %binop1, ptr %binop2) { +; THREE-LABEL: define internal i64 @compute.specialized.1(i64 %x, i64 %y, ptr %binop1, ptr %binop2) { ; THREE-NEXT: entry: ; THREE-NEXT: [[TMP0:%.+]] = call i64 @power(i64 %x, i64 %y) ; THREE-NEXT: [[TMP1:%.+]] = call i64 @mul(i64 %x, i64 %y) @@ -103,7 +103,7 @@ merge: ; THREE-NEXT: ret i64 [[TMP5]] ; THREE-NEXT: } ; -; THREE-LABEL: define internal i64 @compute.2(i64 %x, i64 %y, ptr %binop1, ptr %binop2) { +; THREE-LABEL: define internal i64 @compute.specialized.2(i64 %x, i64 %y, ptr %binop1, ptr %binop2) { ; THREE-NEXT: entry: ; THREE-NEXT: [[TMP0:%.+]] = call i64 @plus(i64 %x, i64 %y) ; THREE-NEXT: [[TMP1:%.+]] = call i64 @minus(i64 %x, i64 %y) @@ -114,7 +114,7 @@ merge: ; THREE-NEXT: ret i64 [[TMP5]] ; THREE-NEXT: } ; -; THREE-LABEL: define internal i64 @compute.3(i64 %x, i64 %y, ptr %binop1, ptr %binop2) { +; THREE-LABEL: define internal i64 @compute.specialized.3(i64 %x, i64 %y, ptr %binop1, ptr %binop2) { ; THREE-NEXT: entry: ; THREE-NEXT: [[TMP0:%.+]] = call i64 @minus(i64 %x, i64 %y) ; THREE-NEXT: [[TMP1:%.+]] = call i64 @power(i64 %x, i64 %y) diff --git a/llvm/test/Transforms/FunctionSpecialization/track-return.ll b/llvm/test/Transforms/FunctionSpecialization/track-return.ll index 58a1c5f2a5904..54e5de018f19c 100644 --- a/llvm/test/Transforms/FunctionSpecialization/track-return.ll +++ b/llvm/test/Transforms/FunctionSpecialization/track-return.ll @@ -3,8 +3,8 @@ define i64 @main() { ; CHECK: define i64 @main ; CHECK-NEXT: entry: -; CHECK-NEXT: [[C1:%.*]] = call i64 @foo.1(i1 true, i64 3, i64 1) -; CHECK-NEXT: [[C2:%.*]] = call i64 @foo.2(i1 false, i64 4, i64 -1) +; CHECK-NEXT: [[C1:%.*]] = call i64 @foo.specialized.1(i1 true, i64 3, i64 1) +; CHECK-NEXT: [[C2:%.*]] = call i64 @foo.specialized.2(i1 false, i64 4, i64 -1) ; CHECK-NEXT: ret i64 8 ; entry: @@ -16,22 +16,22 @@ entry: define 
internal i64 @foo(i1 %flag, i64 %m, i64 %n) { ; -; CHECK: define internal i64 @foo.1 +; CHECK: define internal i64 @foo.specialized.1 ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %plus ; CHECK: plus: -; CHECK-NEXT: [[N0:%.*]] = call i64 @binop.4(i64 3, i64 1) -; CHECK-NEXT: [[RES0:%.*]] = call i64 @bar.6(i64 4) +; CHECK-NEXT: [[N0:%.*]] = call i64 @binop.specialized.4(i64 3, i64 1) +; CHECK-NEXT: [[RES0:%.*]] = call i64 @bar.specialized.6(i64 4) ; CHECK-NEXT: br label %merge ; CHECK: merge: ; CHECK-NEXT: ret i64 undef ; -; CHECK: define internal i64 @foo.2 +; CHECK: define internal i64 @foo.specialized.2 ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %minus ; CHECK: minus: -; CHECK-NEXT: [[N1:%.*]] = call i64 @binop.3(i64 4, i64 -1) -; CHECK-NEXT: [[RES1:%.*]] = call i64 @bar.5(i64 3) +; CHECK-NEXT: [[N1:%.*]] = call i64 @binop.specialized.3(i64 4, i64 -1) +; CHECK-NEXT: [[RES1:%.*]] = call i64 @bar.specialized.5(i64 3) ; CHECK-NEXT: br label %merge ; CHECK: merge: ; CHECK-NEXT: ret i64 undef @@ -56,11 +56,11 @@ merge: define internal i64 @binop(i64 %x, i64 %y) { ; -; CHECK: define internal i64 @binop.3 +; CHECK: define internal i64 @binop.specialized.3 ; CHECK-NEXT: entry: ; CHECK-NEXT: ret i64 undef ; -; CHECK: define internal i64 @binop.4 +; CHECK: define internal i64 @binop.specialized.4 ; CHECK-NEXT: entry: ; CHECK-NEXT: ret i64 undef ; @@ -71,7 +71,7 @@ entry: define internal i64 @bar(i64 %n) { ; -; CHECK: define internal i64 @bar.5 +; CHECK: define internal i64 @bar.specialized.5 ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %if.else ; CHECK: if.else: @@ -79,7 +79,7 @@ define internal i64 @bar(i64 %n) { ; CHECK: if.end: ; CHECK-NEXT: ret i64 undef ; -; CHECK: define internal i64 @bar.6 +; CHECK: define internal i64 @bar.specialized.6 ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %if.then ; CHECK: if.then: diff --git a/llvm/test/Transforms/InstCombine/and.ll b/llvm/test/Transforms/InstCombine/and.ll index 3e2adcbb71c5b..7ec4b0dbb0a12 100644 --- a/llvm/test/Transforms/InstCombine/and.ll +++ b/llvm/test/Transforms/InstCombine/and.ll @@ -2440,3 +2440,102 @@ define i64 @test_and_or_constexpr_infloop() { %or = or i64 %and, 1 ret i64 %or } + +define i32 @and_zext(i32 %a, i1 %b) { +; CHECK-LABEL: @and_zext( +; CHECK-NEXT: [[MASK:%.*]] = zext i1 [[B:%.*]] to i32 +; CHECK-NEXT: [[R:%.*]] = and i32 [[MASK]], [[A:%.*]] +; CHECK-NEXT: ret i32 [[R]] +; + %mask = zext i1 %b to i32 + %r = and i32 %a, %mask + ret i32 %r +} + +define i32 @and_zext_commuted(i32 %a, i1 %b) { +; CHECK-LABEL: @and_zext_commuted( +; CHECK-NEXT: [[MASK:%.*]] = zext i1 [[B:%.*]] to i32 +; CHECK-NEXT: [[R:%.*]] = and i32 [[MASK]], [[A:%.*]] +; CHECK-NEXT: ret i32 [[R]] +; + %mask = zext i1 %b to i32 + %r = and i32 %mask, %a + ret i32 %r +} + +define i32 @and_zext_multiuse(i32 %a, i1 %b) { +; CHECK-LABEL: @and_zext_multiuse( +; CHECK-NEXT: [[MASK:%.*]] = zext i1 [[B:%.*]] to i32 +; CHECK-NEXT: call void @use32(i32 [[MASK]]) +; CHECK-NEXT: [[R:%.*]] = and i32 [[MASK]], [[A:%.*]] +; CHECK-NEXT: ret i32 [[R]] +; + %mask = zext i1 %b to i32 + call void @use32(i32 %mask) + %r = and i32 %a, %mask + ret i32 %r +} + +define <2 x i32> @and_zext_vec(<2 x i32> %a, <2 x i1> %b) { +; CHECK-LABEL: @and_zext_vec( +; CHECK-NEXT: [[MASK:%.*]] = zext <2 x i1> [[B:%.*]] to <2 x i32> +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[MASK]], [[A:%.*]] +; CHECK-NEXT: ret <2 x i32> [[R]] +; + %mask = zext <2 x i1> %b to <2 x i32> + %r = and <2 x i32> %a, %mask + ret <2 x i32> %r +} + +; tests from PR66606 +define i32 @and_zext_eq_even(i32 %a) { +; CHECK-LABEL: 
@and_zext_eq_even( +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 2 +; CHECK-NEXT: [[NOT:%.*]] = zext i1 [[COND]] to i32 +; CHECK-NEXT: [[R:%.*]] = and i32 [[NOT]], [[A]] +; CHECK-NEXT: ret i32 [[R]] +; + %cond = icmp eq i32 %a, 2 + %not = zext i1 %cond to i32 + %r = and i32 %a, %not + ret i32 %r +} + +define i32 @and_zext_eq_even_commuted(i32 %a) { +; CHECK-LABEL: @and_zext_eq_even_commuted( +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 2 +; CHECK-NEXT: [[NOT:%.*]] = zext i1 [[COND]] to i32 +; CHECK-NEXT: [[R:%.*]] = and i32 [[NOT]], [[A]] +; CHECK-NEXT: ret i32 [[R]] +; + %cond = icmp eq i32 %a, 2 + %not = zext i1 %cond to i32 + %r = and i32 %not, %a + ret i32 %r +} + +define i32 @and_zext_eq_odd(i32 %a) { +; CHECK-LABEL: @and_zext_eq_odd( +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 3 +; CHECK-NEXT: [[NOT:%.*]] = zext i1 [[COND]] to i32 +; CHECK-NEXT: [[R:%.*]] = and i32 [[NOT]], [[A]] +; CHECK-NEXT: ret i32 [[R]] +; + %cond = icmp eq i32 %a, 3 + %not = zext i1 %cond to i32 + %r = and i32 %a, %not + ret i32 %r +} + +define i32 @and_zext_eq_odd_commuted(i32 %a) { +; CHECK-LABEL: @and_zext_eq_odd_commuted( +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 3 +; CHECK-NEXT: [[NOT:%.*]] = zext i1 [[COND]] to i32 +; CHECK-NEXT: [[R:%.*]] = and i32 [[NOT]], [[A]] +; CHECK-NEXT: ret i32 [[R]] +; + %cond = icmp eq i32 %a, 3 + %not = zext i1 %cond to i32 + %r = and i32 %not, %a + ret i32 %r +} diff --git a/llvm/test/Transforms/InstCombine/bit_ceil.ll b/llvm/test/Transforms/InstCombine/bit_ceil.ll index 6f714153a598a..52e70c78ba542 100644 --- a/llvm/test/Transforms/InstCombine/bit_ceil.ll +++ b/llvm/test/Transforms/InstCombine/bit_ceil.ll @@ -148,10 +148,9 @@ define i32 @bit_ceil_commuted_operands(i32 %x) { ; CHECK-LABEL: @bit_ceil_commuted_operands( ; CHECK-NEXT: [[DEC:%.*]] = add i32 [[X:%.*]], -1 ; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[DEC]], i1 false), !range [[RNG0]] -; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[CTLZ]] -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 31 -; CHECK-NEXT: [[SEL:%.*]] = shl nuw i32 1, [[TMP2]] -; CHECK-NEXT: ret i32 [[SEL]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]] +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]] +; CHECK-NEXT: ret i32 [[SHL]] ; %dec = add i32 %x, -1 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %dec, i1 false) diff --git a/llvm/test/Transforms/InstCombine/ctpop.ll b/llvm/test/Transforms/InstCombine/ctpop.ll index cf1d72383e694..f3419768bbd02 100644 --- a/llvm/test/Transforms/InstCombine/ctpop.ll +++ b/llvm/test/Transforms/InstCombine/ctpop.ll @@ -475,3 +475,14 @@ define i32 @parity_xor_extra_use2(i32 %arg, i32 %arg1) { %i5 = xor i32 %i2, %i4 ret i32 %i5 } + +define i32 @select_ctpop_zero(i32 %x) { +; CHECK-LABEL: @select_ctpop_zero( +; CHECK-NEXT: [[CTPOP:%.*]] = call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG1]] +; CHECK-NEXT: ret i32 [[CTPOP]] +; + %ctpop = call i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %x, 0 + %res = select i1 %cmp, i32 0, i32 %ctpop + ret i32 %res +} diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll index a67a3fde1e661..1fa0d4c4af054 100644 --- a/llvm/test/Transforms/InstCombine/ispow2.ll +++ b/llvm/test/Transforms/InstCombine/ispow2.ll @@ -345,9 +345,7 @@ define i1 @is_pow2_ctpop_wrong_pred1_logical(i32 %x) { ; CHECK-LABEL: @is_pow2_ctpop_wrong_pred1_logical( ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 2 -; 
CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = select i1 [[NOTZERO]], i1 [[CMP]], i1 false -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 [[CMP]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) %cmp = icmp ugt i32 %t0, 2 diff --git a/llvm/test/Transforms/InstSimplify/and-or-icmp-zero.ll b/llvm/test/Transforms/InstSimplify/and-or-icmp-zero.ll index 3e50a5968b460..d4e0d776391ae 100644 --- a/llvm/test/Transforms/InstSimplify/and-or-icmp-zero.ll +++ b/llvm/test/Transforms/InstSimplify/and-or-icmp-zero.ll @@ -261,30 +261,3 @@ define i1 @and_cmps_ptr_eq_zero_with_mask_commute4(ptr %p, i64 %y) { %r = and i1 %isnotnull, %somebits_are_not_zero ret i1 %r } - -; tests from PR66606 -define i32 @and_zext_eq_zero(i32 %a) { -; CHECK-LABEL: @and_zext_eq_zero( -; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0 -; CHECK-NEXT: [[NOT:%.*]] = zext i1 [[COND]] to i32 -; CHECK-NEXT: [[R:%.*]] = and i32 [[A]], [[NOT]] -; CHECK-NEXT: ret i32 [[R]] -; - %cond = icmp eq i32 %a, 0 - %not = zext i1 %cond to i32 - %r = and i32 %a, %not - ret i32 %r -} - -define i32 @and_zext_eq_zero_commuted(i32 %a) { -; CHECK-LABEL: @and_zext_eq_zero_commuted( -; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0 -; CHECK-NEXT: [[NOT:%.*]] = zext i1 [[COND]] to i32 -; CHECK-NEXT: [[R:%.*]] = and i32 [[NOT]], [[A]] -; CHECK-NEXT: ret i32 [[R]] -; - %cond = icmp eq i32 %a, 0 - %not = zext i1 %cond to i32 - %r = and i32 %not, %a - ret i32 %r -} diff --git a/llvm/test/Transforms/LoopPredication/pr66382.ll b/llvm/test/Transforms/LoopPredication/pr66382.ll index 3ac4cac0615f4..f9a14d470453c 100644 --- a/llvm/test/Transforms/LoopPredication/pr66382.ll +++ b/llvm/test/Transforms/LoopPredication/pr66382.ll @@ -1,4 +1,4 @@ -; XFAIL: * +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 ; RUN: opt -S -loop-predication-skip-profitability-checks=false -passes='require,loop-mssa(loop-predication)' %s | FileCheck %s target triple = "x86_64-unknown-linux-gnu" @@ -6,7 +6,26 @@ target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: nocallback nofree nosync willreturn declare void @llvm.experimental.guard(i1, ...) #0 +; Check that LoopPredication doesn't crash on all-zero branch weights define void @foo() { +; CHECK-LABEL: define void @foo() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: Header: +; CHECK-NEXT: [[J2:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[J_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false, i32 0) [ "deopt"() ] +; CHECK-NEXT: [[J_NEXT]] = add i64 [[J2]], 1 +; CHECK-NEXT: br i1 false, label [[LATCH]], label [[EXIT:%.*]] +; CHECK: Latch: +; CHECK-NEXT: [[SPECULATE_TRIP_COUNT:%.*]] = icmp ult i64 [[J2]], 0 +; CHECK-NEXT: br i1 [[SPECULATE_TRIP_COUNT]], label [[HEADER]], label [[COMMON_RET_LOOPEXIT:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: common.ret.loopexit: +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: br label [[COMMON_RET]] +; entry: br label %Header diff --git a/llvm/test/Transforms/LoopPredication/scale.ll b/llvm/test/Transforms/LoopPredication/scale.ll new file mode 100644 index 0000000000000..29e48cf579598 --- /dev/null +++ b/llvm/test/Transforms/LoopPredication/scale.ll @@ -0,0 +1,259 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -S -loop-predication-skip-profitability-checks=false -passes='require,loop-mssa(loop-predication)' -verify-memoryssa -loop-predication-latch-probability-scale=2 %s 2>&1 | FileCheck %s --check-prefixes=CHECK-PROF +; RUN: opt -S -loop-predication-skip-profitability-checks=false -passes='require,loop-mssa(loop-predication)' -verify-memoryssa -loop-predication-latch-probability-scale=1.9 %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOTPROF + +; LatchExitProbability: 0x20000000 / 0x80000000 = 25.00% +; ExitingBlockProbability: 0x40000000 / 0x80000000 = 50.00% +; Predicate is profitable when the scale factor is 2 and not profitable if it's less than 2. +define i64 @predicate_eq_ones(ptr nocapture readonly %arg, i32 %length, ptr nocapture readonly %arg2, ptr nocapture readonly %n_addr, i64 %i) !prof !21 { +; CHECK-PROF-LABEL: define i64 @predicate_eq_ones( +; CHECK-PROF-SAME: ptr nocapture readonly [[ARG:%.*]], i32 [[LENGTH:%.*]], ptr nocapture readonly [[ARG2:%.*]], ptr nocapture readonly [[N_ADDR:%.*]], i64 [[I:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-PROF-NEXT: entry: +; CHECK-PROF-NEXT: [[LENGTH_EXT:%.*]] = zext i32 [[LENGTH]] to i64 +; CHECK-PROF-NEXT: [[N_PRE:%.*]] = load i64, ptr [[N_ADDR]], align 4 +; CHECK-PROF-NEXT: [[TMP0:%.*]] = icmp ule i64 1048576, [[LENGTH_EXT]] +; CHECK-PROF-NEXT: [[TMP1:%.*]] = icmp ult i64 0, [[LENGTH_EXT]] +; CHECK-PROF-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +; CHECK-PROF-NEXT: [[TMP3:%.*]] = freeze i1 [[TMP2]] +; CHECK-PROF-NEXT: br label [[HEADER:%.*]] +; CHECK-PROF: Header: +; CHECK-PROF-NEXT: [[RESULT_IN3:%.*]] = phi ptr [ [[ARG2]], [[ENTRY:%.*]] ], [ [[ARG]], [[LATCH:%.*]] ] +; CHECK-PROF-NEXT: [[J2:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LATCH]] ] +; CHECK-PROF-NEXT: [[WITHIN_BOUNDS:%.*]] = icmp ult i64 [[J2]], [[LENGTH_EXT]] +; CHECK-PROF-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 [[TMP3]], i32 9) [ "deopt"() ] +; CHECK-PROF-NEXT: call void @llvm.assume(i1 [[WITHIN_BOUNDS]]) +; CHECK-PROF-NEXT: [[INNERCMP:%.*]] = icmp eq i64 [[J2]], [[N_PRE]] +; CHECK-PROF-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J2]], 1 +; CHECK-PROF-NEXT: br i1 [[INNERCMP]], label [[LATCH]], label [[EXIT:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-PROF: Latch: +; CHECK-PROF-NEXT: [[SPECULATE_TRIP_COUNT:%.*]] = icmp ult i64 [[J_NEXT]], 1048576 +; CHECK-PROF-NEXT: br i1 [[SPECULATE_TRIP_COUNT]], label [[HEADER]], label [[EXITLATCH:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK-PROF: exitLatch: +; CHECK-PROF-NEXT: ret i64 1 +; CHECK-PROF: exit: +; CHECK-PROF-NEXT: [[RESULT_IN3_LCSSA:%.*]] = phi ptr [ [[RESULT_IN3]], [[HEADER]] ] +; CHECK-PROF-NEXT: [[RESULT_LE:%.*]] = load i64, ptr [[RESULT_IN3_LCSSA]], align 8 +; CHECK-PROF-NEXT: ret i64 [[RESULT_LE]] +; +; CHECK-NOTPROF-LABEL: define i64 @predicate_eq_ones( +; CHECK-NOTPROF-SAME: ptr nocapture readonly [[ARG:%.*]], i32 [[LENGTH:%.*]], ptr nocapture readonly [[ARG2:%.*]], ptr nocapture readonly [[N_ADDR:%.*]], i64 [[I:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-NOTPROF-NEXT: entry: +; CHECK-NOTPROF-NEXT: [[LENGTH_EXT:%.*]] = zext i32 [[LENGTH]] to i64 +; CHECK-NOTPROF-NEXT: [[N_PRE:%.*]] = load i64, ptr [[N_ADDR]], align 4 +; CHECK-NOTPROF-NEXT: br label [[HEADER:%.*]] +; CHECK-NOTPROF: Header: +; CHECK-NOTPROF-NEXT: [[RESULT_IN3:%.*]] = phi ptr [ [[ARG2]], [[ENTRY:%.*]] ], [ [[ARG]], [[LATCH:%.*]] ] +; CHECK-NOTPROF-NEXT: [[J2:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LATCH]] ] +; CHECK-NOTPROF-NEXT: [[WITHIN_BOUNDS:%.*]] = icmp ult i64 [[J2]], [[LENGTH_EXT]] +; CHECK-NOTPROF-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[WITHIN_BOUNDS]], i32 9) [ "deopt"() ] +; CHECK-NOTPROF-NEXT: [[INNERCMP:%.*]] = icmp eq i64 [[J2]], [[N_PRE]] +; CHECK-NOTPROF-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J2]], 1 +; CHECK-NOTPROF-NEXT: br i1 [[INNERCMP]], label [[LATCH]], label [[EXIT:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NOTPROF: Latch: +; CHECK-NOTPROF-NEXT: [[SPECULATE_TRIP_COUNT:%.*]] = icmp ult i64 [[J_NEXT]], 1048576 +; CHECK-NOTPROF-NEXT: br i1 [[SPECULATE_TRIP_COUNT]], label [[HEADER]], label [[EXITLATCH:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK-NOTPROF: exitLatch: +; CHECK-NOTPROF-NEXT: ret i64 1 +; CHECK-NOTPROF: exit: +; CHECK-NOTPROF-NEXT: [[RESULT_IN3_LCSSA:%.*]] = phi ptr [ [[RESULT_IN3]], [[HEADER]] ] +; CHECK-NOTPROF-NEXT: [[RESULT_LE:%.*]] = load i64, ptr [[RESULT_IN3_LCSSA]], align 8 +; CHECK-NOTPROF-NEXT: ret i64 [[RESULT_LE]] +; +entry: + %length.ext = zext i32 %length to i64 + %n.pre = load i64, ptr %n_addr, align 4 + br label %Header + +Header: ; preds = %entry, %Latch + %result.in3 = phi ptr [ %arg2, %entry ], [ %arg, %Latch ] + %j2 = phi i64 [ 0, %entry ], [ %j.next, %Latch ] + %within.bounds = icmp ult i64 %j2, %length.ext + call void (i1, ...) 
@llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + %innercmp = icmp eq i64 %j2, %n.pre + %j.next = add nuw nsw i64 %j2, 1 + br i1 %innercmp, label %Latch, label %exit, !prof !0 + +Latch: ; preds = %Header + %speculate_trip_count = icmp ult i64 %j.next, 1048576 + br i1 %speculate_trip_count, label %Header, label %exitLatch, !prof !2 + +exitLatch: ; preds = %Latch + ret i64 1 + +exit: ; preds = %Header + %result.in3.lcssa = phi ptr [ %result.in3, %Header ] + %result.le = load i64, ptr %result.in3.lcssa, align 8 + ret i64 %result.le +} +!0 = !{!"branch_weights", i32 1, i32 1} + +; Same as the previous one, but with zero weights (should be treated as if no profile - equal probability) +define i64 @predicate_eq_zeroes(ptr nocapture readonly %arg, i32 %length, ptr nocapture readonly %arg2, ptr nocapture readonly %n_addr, i64 %i) !prof !21 { +; CHECK-PROF-LABEL: define i64 @predicate_eq_zeroes( +; CHECK-PROF-SAME: ptr nocapture readonly [[ARG:%.*]], i32 [[LENGTH:%.*]], ptr nocapture readonly [[ARG2:%.*]], ptr nocapture readonly [[N_ADDR:%.*]], i64 [[I:%.*]]) !prof [[PROF0]] { +; CHECK-PROF-NEXT: entry: +; CHECK-PROF-NEXT: [[LENGTH_EXT:%.*]] = zext i32 [[LENGTH]] to i64 +; CHECK-PROF-NEXT: [[N_PRE:%.*]] = load i64, ptr [[N_ADDR]], align 4 +; CHECK-PROF-NEXT: [[TMP0:%.*]] = icmp ule i64 1048576, [[LENGTH_EXT]] +; CHECK-PROF-NEXT: [[TMP1:%.*]] = icmp ult i64 0, [[LENGTH_EXT]] +; CHECK-PROF-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +; CHECK-PROF-NEXT: [[TMP3:%.*]] = freeze i1 [[TMP2]] +; CHECK-PROF-NEXT: br label [[HEADER:%.*]] +; CHECK-PROF: Header: +; CHECK-PROF-NEXT: [[RESULT_IN3:%.*]] = phi ptr [ [[ARG2]], [[ENTRY:%.*]] ], [ [[ARG]], [[LATCH:%.*]] ] +; CHECK-PROF-NEXT: [[J2:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LATCH]] ] +; CHECK-PROF-NEXT: [[WITHIN_BOUNDS:%.*]] = icmp ult i64 [[J2]], [[LENGTH_EXT]] +; CHECK-PROF-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 [[TMP3]], i32 9) [ "deopt"() ] +; CHECK-PROF-NEXT: call void @llvm.assume(i1 [[WITHIN_BOUNDS]]) +; CHECK-PROF-NEXT: [[INNERCMP:%.*]] = icmp eq i64 [[J2]], [[N_PRE]] +; CHECK-PROF-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J2]], 1 +; CHECK-PROF-NEXT: br i1 [[INNERCMP]], label [[LATCH]], label [[EXIT:%.*]], !prof [[PROF3:![0-9]+]] +; CHECK-PROF: Latch: +; CHECK-PROF-NEXT: [[SPECULATE_TRIP_COUNT:%.*]] = icmp ult i64 [[J_NEXT]], 1048576 +; CHECK-PROF-NEXT: br i1 [[SPECULATE_TRIP_COUNT]], label [[HEADER]], label [[EXITLATCH:%.*]], !prof [[PROF2]] +; CHECK-PROF: exitLatch: +; CHECK-PROF-NEXT: ret i64 1 +; CHECK-PROF: exit: +; CHECK-PROF-NEXT: [[RESULT_IN3_LCSSA:%.*]] = phi ptr [ [[RESULT_IN3]], [[HEADER]] ] +; CHECK-PROF-NEXT: [[RESULT_LE:%.*]] = load i64, ptr [[RESULT_IN3_LCSSA]], align 8 +; CHECK-PROF-NEXT: ret i64 [[RESULT_LE]] +; +; CHECK-NOTPROF-LABEL: define i64 @predicate_eq_zeroes( +; CHECK-NOTPROF-SAME: ptr nocapture readonly [[ARG:%.*]], i32 [[LENGTH:%.*]], ptr nocapture readonly [[ARG2:%.*]], ptr nocapture readonly [[N_ADDR:%.*]], i64 [[I:%.*]]) !prof [[PROF0]] { +; CHECK-NOTPROF-NEXT: entry: +; CHECK-NOTPROF-NEXT: [[LENGTH_EXT:%.*]] = zext i32 [[LENGTH]] to i64 +; CHECK-NOTPROF-NEXT: [[N_PRE:%.*]] = load i64, ptr [[N_ADDR]], align 4 +; CHECK-NOTPROF-NEXT: br label [[HEADER:%.*]] +; CHECK-NOTPROF: Header: +; CHECK-NOTPROF-NEXT: [[RESULT_IN3:%.*]] = phi ptr [ [[ARG2]], [[ENTRY:%.*]] ], [ [[ARG]], [[LATCH:%.*]] ] +; CHECK-NOTPROF-NEXT: [[J2:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LATCH]] ] +; CHECK-NOTPROF-NEXT: [[WITHIN_BOUNDS:%.*]] = icmp ult i64 [[J2]], [[LENGTH_EXT]] +; CHECK-NOTPROF-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[WITHIN_BOUNDS]], i32 9) [ "deopt"() ] +; CHECK-NOTPROF-NEXT: [[INNERCMP:%.*]] = icmp eq i64 [[J2]], [[N_PRE]] +; CHECK-NOTPROF-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J2]], 1 +; CHECK-NOTPROF-NEXT: br i1 [[INNERCMP]], label [[LATCH]], label [[EXIT:%.*]], !prof [[PROF3:![0-9]+]] +; CHECK-NOTPROF: Latch: +; CHECK-NOTPROF-NEXT: [[SPECULATE_TRIP_COUNT:%.*]] = icmp ult i64 [[J_NEXT]], 1048576 +; CHECK-NOTPROF-NEXT: br i1 [[SPECULATE_TRIP_COUNT]], label [[HEADER]], label [[EXITLATCH:%.*]], !prof [[PROF2]] +; CHECK-NOTPROF: exitLatch: +; CHECK-NOTPROF-NEXT: ret i64 1 +; CHECK-NOTPROF: exit: +; CHECK-NOTPROF-NEXT: [[RESULT_IN3_LCSSA:%.*]] = phi ptr [ [[RESULT_IN3]], [[HEADER]] ] +; CHECK-NOTPROF-NEXT: [[RESULT_LE:%.*]] = load i64, ptr [[RESULT_IN3_LCSSA]], align 8 +; CHECK-NOTPROF-NEXT: ret i64 [[RESULT_LE]] +; +entry: + %length.ext = zext i32 %length to i64 + %n.pre = load i64, ptr %n_addr, align 4 + br label %Header + +Header: ; preds = %entry, %Latch + %result.in3 = phi ptr [ %arg2, %entry ], [ %arg, %Latch ] + %j2 = phi i64 [ 0, %entry ], [ %j.next, %Latch ] + %within.bounds = icmp ult i64 %j2, %length.ext + call void (i1, ...) 
@llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + %innercmp = icmp eq i64 %j2, %n.pre + %j.next = add nuw nsw i64 %j2, 1 + br i1 %innercmp, label %Latch, label %exit, !prof !1 + +Latch: ; preds = %Header + %speculate_trip_count = icmp ult i64 %j.next, 1048576 + br i1 %speculate_trip_count, label %Header, label %exitLatch, !prof !2 + +exitLatch: ; preds = %Latch + ret i64 1 + +exit: ; preds = %Header + %result.in3.lcssa = phi ptr [ %result.in3, %Header ] + %result.le = load i64, ptr %result.in3.lcssa, align 8 + ret i64 %result.le +} +!1 = !{!"branch_weights", i32 0, i32 0} + +; No profile on br in Header +define i64 @predicate_eq_none(ptr nocapture readonly %arg, i32 %length, ptr nocapture readonly %arg2, ptr nocapture readonly %n_addr, i64 %i) !prof !21 { +; CHECK-PROF-LABEL: define i64 @predicate_eq_none( +; CHECK-PROF-SAME: ptr nocapture readonly [[ARG:%.*]], i32 [[LENGTH:%.*]], ptr nocapture readonly [[ARG2:%.*]], ptr nocapture readonly [[N_ADDR:%.*]], i64 [[I:%.*]]) !prof [[PROF0]] { +; CHECK-PROF-NEXT: entry: +; CHECK-PROF-NEXT: [[LENGTH_EXT:%.*]] = zext i32 [[LENGTH]] to i64 +; CHECK-PROF-NEXT: [[N_PRE:%.*]] = load i64, ptr [[N_ADDR]], align 4 +; CHECK-PROF-NEXT: [[TMP0:%.*]] = icmp ule i64 1048576, [[LENGTH_EXT]] +; CHECK-PROF-NEXT: [[TMP1:%.*]] = icmp ult i64 0, [[LENGTH_EXT]] +; CHECK-PROF-NEXT: [[TMP2:%.*]] = and i1 [[TMP1]], [[TMP0]] +; CHECK-PROF-NEXT: [[TMP3:%.*]] = freeze i1 [[TMP2]] +; CHECK-PROF-NEXT: br label [[HEADER:%.*]] +; CHECK-PROF: Header: +; CHECK-PROF-NEXT: [[RESULT_IN3:%.*]] = phi ptr [ [[ARG2]], [[ENTRY:%.*]] ], [ [[ARG]], [[LATCH:%.*]] ] +; CHECK-PROF-NEXT: [[J2:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LATCH]] ] +; CHECK-PROF-NEXT: [[WITHIN_BOUNDS:%.*]] = icmp ult i64 [[J2]], [[LENGTH_EXT]] +; CHECK-PROF-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[TMP3]], i32 9) [ "deopt"() ] +; CHECK-PROF-NEXT: call void @llvm.assume(i1 [[WITHIN_BOUNDS]]) +; CHECK-PROF-NEXT: [[INNERCMP:%.*]] = icmp eq i64 [[J2]], [[N_PRE]] +; CHECK-PROF-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J2]], 1 +; CHECK-PROF-NEXT: br i1 [[INNERCMP]], label [[LATCH]], label [[EXIT:%.*]] +; CHECK-PROF: Latch: +; CHECK-PROF-NEXT: [[SPECULATE_TRIP_COUNT:%.*]] = icmp ult i64 [[J_NEXT]], 1048576 +; CHECK-PROF-NEXT: br i1 [[SPECULATE_TRIP_COUNT]], label [[HEADER]], label [[EXITLATCH:%.*]], !prof [[PROF2]] +; CHECK-PROF: exitLatch: +; CHECK-PROF-NEXT: ret i64 1 +; CHECK-PROF: exit: +; CHECK-PROF-NEXT: [[RESULT_IN3_LCSSA:%.*]] = phi ptr [ [[RESULT_IN3]], [[HEADER]] ] +; CHECK-PROF-NEXT: [[RESULT_LE:%.*]] = load i64, ptr [[RESULT_IN3_LCSSA]], align 8 +; CHECK-PROF-NEXT: ret i64 [[RESULT_LE]] +; +; CHECK-NOTPROF-LABEL: define i64 @predicate_eq_none( +; CHECK-NOTPROF-SAME: ptr nocapture readonly [[ARG:%.*]], i32 [[LENGTH:%.*]], ptr nocapture readonly [[ARG2:%.*]], ptr nocapture readonly [[N_ADDR:%.*]], i64 [[I:%.*]]) !prof [[PROF0]] { +; CHECK-NOTPROF-NEXT: entry: +; CHECK-NOTPROF-NEXT: [[LENGTH_EXT:%.*]] = zext i32 [[LENGTH]] to i64 +; CHECK-NOTPROF-NEXT: [[N_PRE:%.*]] = load i64, ptr [[N_ADDR]], align 4 +; CHECK-NOTPROF-NEXT: br label [[HEADER:%.*]] +; CHECK-NOTPROF: Header: +; CHECK-NOTPROF-NEXT: [[RESULT_IN3:%.*]] = phi ptr [ [[ARG2]], [[ENTRY:%.*]] ], [ [[ARG]], [[LATCH:%.*]] ] +; CHECK-NOTPROF-NEXT: [[J2:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LATCH]] ] +; CHECK-NOTPROF-NEXT: [[WITHIN_BOUNDS:%.*]] = icmp ult i64 [[J2]], [[LENGTH_EXT]] +; CHECK-NOTPROF-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 [[WITHIN_BOUNDS]], i32 9) [ "deopt"() ] +; CHECK-NOTPROF-NEXT: [[INNERCMP:%.*]] = icmp eq i64 [[J2]], [[N_PRE]] +; CHECK-NOTPROF-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J2]], 1 +; CHECK-NOTPROF-NEXT: br i1 [[INNERCMP]], label [[LATCH]], label [[EXIT:%.*]] +; CHECK-NOTPROF: Latch: +; CHECK-NOTPROF-NEXT: [[SPECULATE_TRIP_COUNT:%.*]] = icmp ult i64 [[J_NEXT]], 1048576 +; CHECK-NOTPROF-NEXT: br i1 [[SPECULATE_TRIP_COUNT]], label [[HEADER]], label [[EXITLATCH:%.*]], !prof [[PROF2]] +; CHECK-NOTPROF: exitLatch: +; CHECK-NOTPROF-NEXT: ret i64 1 +; CHECK-NOTPROF: exit: +; CHECK-NOTPROF-NEXT: [[RESULT_IN3_LCSSA:%.*]] = phi ptr [ [[RESULT_IN3]], [[HEADER]] ] +; CHECK-NOTPROF-NEXT: [[RESULT_LE:%.*]] = load i64, ptr [[RESULT_IN3_LCSSA]], align 8 +; CHECK-NOTPROF-NEXT: ret i64 [[RESULT_LE]] +; +entry: + %length.ext = zext i32 %length to i64 + %n.pre = load i64, ptr %n_addr, align 4 + br label %Header + +Header: ; preds = %entry, %Latch + %result.in3 = phi ptr [ %arg2, %entry ], [ %arg, %Latch ] + %j2 = phi i64 [ 0, %entry ], [ %j.next, %Latch ] + %within.bounds = icmp ult i64 %j2, %length.ext + call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ] + %innercmp = icmp eq i64 %j2, %n.pre + %j.next = add nuw nsw i64 %j2, 1 + br i1 %innercmp, label %Latch, label %exit + +Latch: ; preds = %Header + %speculate_trip_count = icmp ult i64 %j.next, 1048576 + br i1 %speculate_trip_count, label %Header, label %exitLatch, !prof !2 + +exitLatch: ; preds = %Latch + ret i64 1 + +exit: ; preds = %Header + %result.in3.lcssa = phi ptr [ %result.in3, %Header ] + %result.le = load i64, ptr %result.in3.lcssa, align 8 + ret i64 %result.le +} + +!2 = !{!"branch_weights", i32 3, i32 1} +!21 = !{!"function_entry_count", i64 20000} + +declare i64 @llvm.experimental.deoptimize.i64(...) +declare void @llvm.experimental.guard(i1, ...) 
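
As a side note on the arithmetic the test comments above describe: predication is treated as profitable only when the non-latch exit is taken no more often than the scaled latch exit. A minimal sketch of that comparison in Python, with illustrative names only (this is not LoopPredication's actual API):

    def latch_predication_profitable(latch_exit_prob, exiting_block_prob, scale):
        # Unprofitable when some non-latch exit is taken more often than
        # latch_exit_prob * scale; profitable otherwise.
        return exiting_block_prob <= latch_exit_prob * scale

    # Values from the test above: latch exit 25%, other exiting block 50%.
    assert latch_predication_profitable(0.25, 0.50, 2.0)      # scale=2   -> CHECK-PROF
    assert not latch_predication_profitable(0.25, 0.50, 1.9)  # scale=1.9 -> CHECK-NOTPROF
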
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index aca02f37abe2e..18b05c05d9b9d 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -1122,12 +1122,10 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i8> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i8> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = udiv <4 x i8> [[TMP3]], -; CHECK-NEXT: [[NARROW:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i8> zeroinitializer, <4 x i8> [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[NARROW]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/NewGVN/assume_dominating_icmp.ll b/llvm/test/Transforms/NewGVN/assume_dominating_icmp.ll index 1bf9f65535c3c..2294f1e1ac85b 100644 --- a/llvm/test/Transforms/NewGVN/assume_dominating_icmp.ll +++ b/llvm/test/Transforms/NewGVN/assume_dominating_icmp.ll @@ -4,61 +4,70 @@ @c = global i32 0, align 4 ; Function Attrs: nounwind optsize uwtable -define dso_local i32 @main(i1 %cond, i32 %0, i32 %1) { -; CHECK-LABEL: define dso_local i32 @main( -; CHECK-SAME: i1 [[COND:%.*]], i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +define i32 @main(i1 %cond1, i32 %arg0, i32 %arg1) { +; CHECK-LABEL: define i32 @main( +; CHECK-SAME: i1 [[COND1:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END6:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP0]], 1 -; CHECK-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_END6]], label [[IF_THEN2:%.*]] -; CHECK: if.then2: -; CHECK-NEXT: [[TOBOOL3_NOT:%.*]] = icmp ne i32 [[XOR]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TOBOOL3_NOT]]) -; CHECK-NEXT: br label [[IF_END6]] -; CHECK: if.end6: -; CHECK-NEXT: [[F_0:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[XOR]], [[IF_THEN]] ], [ [[XOR]], [[IF_THEN2]] ] -; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[F_0]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @c, align 4 -; CHECK-NEXT: [[OR:%.*]] = or i32 [[TMP2]], [[NOT]] -; CHECK-NEXT: [[TOBOOL7_NOT:%.*]] = icmp eq i32 [[OR]], 0 -; CHECK-NEXT: [[TOBOOL9_NOT:%.*]] = icmp eq i32 [[F_0]], 0 -; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[TOBOOL7_NOT]], [[TOBOOL9_NOT]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END10:%.*]], label [[WHILE_COND_PREHEADER:%.*]] -; CHECK: while.cond.preheader: -; CHECK-NEXT: ret i32 1 -; CHECK: if.end10: +; CHECK-NEXT: br i1 [[COND1]], label [[BB1:%.*]], label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[ARG0]], 1 +; CHECK-NEXT: [[COND2:%.*]] = icmp eq i32 [[ARG1]], 0 +; CHECK-NEXT: br i1 [[COND2]], label [[BB3]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[COND3:%.*]] = icmp ne i32 [[XOR1]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND3]]) +; CHECK-NEXT: br label [[BB3]] +; CHECK: 
bb3:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[XOR1]], [[BB1]] ], [ [[XOR1]], [[BB2]] ]
+; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[PHI]], -1
+; CHECK-NEXT: [[LD:%.*]] = load i32, ptr @c, align 4
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[LD]], [[XOR2]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[PHI]], 0
+; CHECK-NEXT: [[COND4:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT: br i1 [[COND4]], label [[EXIT1:%.*]], label [[EXIT2:%.*]]
+; CHECK: exit1:
+; CHECK-NEXT: ret i32 0
+; CHECK: exit2:
+; CHECK-NEXT: ret i32 1
;
 entry:
- br i1 %cond, label %if.then, label %if.end6
-
-if.then: ; preds = %entry
- %xor = xor i32 %0, 1
- %tobool1.not = icmp eq i32 %1, 0
- br i1 %tobool1.not, label %if.end6, label %if.then2
+; entry
+; / |
+; bb1 |
+; / | |
+; bb2 | |
+; \| |
+; bb3
+; / \
+; exit1 exit2
+ br i1 %cond1, label %bb1, label %bb3
 
-if.then2: ; preds = %if.then
- %tobool3.not = icmp ne i32 %xor, 0
- tail call void @llvm.assume(i1 %tobool3.not)
- br label %if.end6
+bb1: ; preds = %entry
+ %xor1 = xor i32 %arg0, 1
+ %cond2 = icmp eq i32 %arg1, 0
+ br i1 %cond2, label %bb3, label %bb2
 
-if.end6: ; preds = %if.then2, %if.then, %entry
- %f.0 = phi i32 [ undef, %entry ], [ %xor, %if.then ], [ %xor, %if.then2 ]
- %not = xor i32 %f.0, -1
- %2 = load i32, ptr @c, align 4
- %or = or i32 %2, %not
- %tobool7.not = icmp eq i32 %or, 0
- %tobool9.not = icmp eq i32 %f.0, 0
- %or.cond = or i1 %tobool7.not, %tobool9.not
- br i1 %or.cond, label %if.end10, label %while.cond.preheader
+bb2: ; preds = %bb1
+ %cond3 = icmp ne i32 %xor1, 0
+ tail call void @llvm.assume(i1 %cond3)
+ br label %bb3
 
-while.cond.preheader: ; preds = %if.end6
- ret i32 1
+bb3: ; preds = %bb2, %bb1, %entry
+ %phi = phi i32 [ undef, %entry ], [ %xor1, %bb1 ], [ %xor1, %bb2 ]
+ %xor2 = xor i32 %phi, -1
+ %ld = load i32, ptr @c, align 4
+ %or = or i32 %ld, %xor2
+ %cmp1 = icmp eq i32 %or, 0
+ %cmp2 = icmp eq i32 %phi, 0
+ %cond4 = or i1 %cmp1, %cmp2
+ br i1 %cond4, label %exit1, label %exit2
 
-if.end10: ; preds = %if.end6
+exit1: ; preds = %bb3
 ret i32 0
+
+exit2: ; preds = %bb3
+ ret i32 1
 }
 
 declare void @llvm.assume(i1 noundef)
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index 9c61fd1f13593..34a0eb7a20d3c 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -568,9 +568,14 @@ int main(int argc, char **argv) {
 // the facility for updating public visibility to linkage unit visibility when
 // specified by an internal option. This is normally done during LTO which is
 // not performed via opt.
- updateVCallVisibilityInModule(*M,
- /* WholeProgramVisibilityEnabledInLTO */ false,
- /* DynamicExportSymbols */ {});
+ updateVCallVisibilityInModule(
+ *M,
+ /*WholeProgramVisibilityEnabledInLTO=*/false,
+ // FIXME: These need linker information via a
+ // TBD new interface.
+ /*DynamicExportSymbols=*/{},
+ /*ValidateAllVtablesHaveTypeInfos=*/false,
+ /*IsVisibleToRegularObj=*/[](StringRef) { return true; });
 
 // Figure out what stream we are supposed to write to...
 std::unique_ptr<ToolOutputFile> Out;
diff --git a/llvm/utils/lit/lit/LitConfig.py b/llvm/utils/lit/lit/LitConfig.py
index 331c21d7b9dfc..d7e79b60f385b 100644
--- a/llvm/utils/lit/lit/LitConfig.py
+++ b/llvm/utils/lit/lit/LitConfig.py
@@ -36,7 +36,6 @@ def __init__(
 config_prefix=None,
 maxIndividualTestTime=0,
 parallelism_groups={},
- echo_all_commands=False,
 per_test_coverage=False,
 ):
 # The name of the test runner. 
@@ -87,7 +86,6 @@ def __init__(
 
 self.maxIndividualTestTime = maxIndividualTestTime
 self.parallelism_groups = parallelism_groups
- self.echo_all_commands = echo_all_commands
 self.per_test_coverage = per_test_coverage
 
 @property
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index b2c50f563696f..528694b07f033 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -8,6 +8,7 @@
 import stat
 import pathlib
 import platform
+import shlex
 import shutil
 import tempfile
 import threading
@@ -348,12 +349,12 @@ def executeBuiltinExport(cmd, shenv):
 
 def executeBuiltinEcho(cmd, shenv):
- """Interpret a redirected echo command"""
+ """Interpret a redirected echo or @echo command"""
 opened_files = []
 stdin, stdout, stderr = processRedirects(cmd, subprocess.PIPE, shenv, opened_files)
 if stdin != subprocess.PIPE or stderr != subprocess.PIPE:
 raise InternalShellError(
- cmd, "stdin and stderr redirects not supported for echo"
+ cmd, f"stdin and stderr redirects not supported for {cmd.args[0]}"
 )
 
 # Some tests have un-redirected echo commands to help debug test failures.
@@ -700,6 +701,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
 "cd": executeBuiltinCd,
 "export": executeBuiltinExport,
 "echo": executeBuiltinEcho,
+ "@echo": executeBuiltinEcho,
 "mkdir": executeBuiltinMkdir,
 "popd": executeBuiltinPopd,
 "pushd": executeBuiltinPushd,
@@ -927,7 +929,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
 if res == -signal.SIGINT:
 raise KeyboardInterrupt
 if proc_not_counts[i] % 2:
- res = not res
+ res = 1 if res == 0 else 0
 elif proc_not_counts[i] > 1:
 res = 1 if res != 0 else 0
 
@@ -990,19 +992,60 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
 return exitCode
 
-def executeScriptInternal(test, litConfig, tmpBase, commands, cwd):
+def formatOutput(title, data, limit=None):
+ if not data.strip():
+ return ""
+ if limit is not None and len(data) > limit:
+ data = data[:limit] + "\n...\n"
+ msg = "data was truncated"
+ else:
+ msg = ""
+ ndashes = 30
+ # fmt: off
+ out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n"
+ out += f"# | " + "\n# | ".join(data.splitlines()) + "\n"
+ out += f"# `---{msg}{'-' * (ndashes - 4 - len(msg))}\n"
+ # fmt: on
+ return out
+
+
+# Normally returns out, err, exitCode, timeoutInfo.
+#
+# If debug is True (the normal lit behavior), err is empty, and out contains an
+# execution trace, including stdout and stderr shown per command executed.
+#
+# If debug is False (set by some custom lit test formats that call this
+# function), out contains only stdout from the script, err contains only stderr
+# from the script, and there is no execution trace.
+def executeScriptInternal(test, litConfig, tmpBase, commands, cwd, debug=True):
 cmds = []
 for i, ln in enumerate(commands):
+ # Within lit, we try to always add '%dbg(...)' to command lines in order
+ # to maximize debuggability. However, custom lit test formats might not
+ # always add it, so add a generic debug message in that case. 
match = re.fullmatch(kPdbgRegex, ln) if match: + dbg = match.group(1) command = match.group(2) - ln = commands[i] = match.expand(": '\\1'; \\2" if command else ": '\\1'") + else: + dbg = "command line" + command = ln + if debug: + ln = f"@echo '# {dbg}' " + if command: + ln += f"&& @echo {shlex.quote(command.lstrip())} && {command}" + else: + ln += "has no command after substitutions" + else: + ln = command try: cmds.append( ShUtil.ShParser(ln, litConfig.isWindows, test.config.pipefail).parse() ) except: - return lit.Test.Result(Test.FAIL, "shell parser error on: %r" % ln) + return lit.Test.Result( + Test.FAIL, f"shell parser error on {dbg}: {command.lstrip()}\n" + ) cmd = cmds[0] for c in cmds[1:]: @@ -1022,8 +1065,42 @@ def executeScriptInternal(test, litConfig, tmpBase, commands, cwd): out = err = "" for i, result in enumerate(results): - # Write the command line run. - out += "$ %s\n" % (" ".join('"%s"' % s for s in result.command.args),) + if not debug: + out += result.stdout + err += result.stderr + continue + + # The purpose of an "@echo" command is merely to add a debugging message + # directly to lit's output. It is used internally by lit's internal + # shell and is not currently documented for use in lit tests. However, + # if someone misuses it (e.g., both "echo" and "@echo" complain about + # stdin redirection), produce the normal execution trace to facilitate + # debugging. + if ( + result.command.args[0] == "@echo" + and result.exitCode == 0 + and not result.stderr + and not result.outputFiles + and not result.timeoutReached + ): + out += result.stdout + continue + + # Write the command line that was run. Properly quote it. Leading + # "!" commands should not be quoted as that would indicate they are not + # the builtins. + out += "# executed command: " + nLeadingBangs = next( + (i for i, cmd in enumerate(result.command.args) if cmd != "!"), + len(result.command.args), + ) + out += "! " * nLeadingBangs + out += " ".join( + shlex.quote(str(s)) + for i, s in enumerate(result.command.args) + if i >= nLeadingBangs + ) + out += "\n" # If nothing interesting happened, move on. if ( @@ -1038,22 +1115,14 @@ def executeScriptInternal(test, litConfig, tmpBase, commands, cwd): # Add the command output, if redirected. 
for (name, path, data) in result.outputFiles: - if data.strip(): - out += "# redirected output from %r:\n" % (name,) - data = to_string(data.decode("utf-8", errors="replace")) - if len(data) > 1024: - out += data[:1024] + "\n...\n" - out += "note: data was truncated\n" - else: - out += data - out += "\n" - + data = to_string(data.decode("utf-8", errors="replace")) + out += formatOutput(f"redirected output from '{name}'", data, limit=1024) if result.stdout.strip(): - out += "# command output:\n%s\n" % (result.stdout,) + out += formatOutput("command stdout", result.stdout) if result.stderr.strip(): - out += "# command stderr:\n%s\n" % (result.stderr,) + out += formatOutput("command stderr", result.stderr) if not result.stdout.strip() and not result.stderr.strip(): - out += "note: command had no output on stdout or stderr\n" + out += "# note: command had no output on stdout or stderr\n" # Show the error conditions: if result.exitCode != 0: @@ -1063,9 +1132,9 @@ def executeScriptInternal(test, litConfig, tmpBase, commands, cwd): codeStr = hex(int(result.exitCode & 0xFFFFFFFF)).rstrip("L") else: codeStr = str(result.exitCode) - out += "error: command failed with exit status: %s\n" % (codeStr,) + out += "# error: command failed with exit status: %s\n" % (codeStr,) if litConfig.maxIndividualTestTime > 0 and result.timeoutReached: - out += "error: command reached timeout: %s\n" % ( + out += "# error: command reached timeout: %s\n" % ( str(result.timeoutReached), ) @@ -1095,21 +1164,49 @@ def executeScript(test, litConfig, tmpBase, commands, cwd): commands[i] = match.expand( "echo '\\1' > nul && " if command else "echo '\\1' > nul" ) - if litConfig.echo_all_commands: - f.write("@echo on\n") - else: - f.write("@echo off\n") + f.write("@echo on\n") f.write("\n@if %ERRORLEVEL% NEQ 0 EXIT\n".join(commands)) else: for i, ln in enumerate(commands): match = re.fullmatch(kPdbgRegex, ln) if match: + dbg = match.group(1) command = match.group(2) - commands[i] = match.expand(": '\\1'; \\2" if command else ": '\\1'") + # Echo the debugging diagnostic to stderr. + # + # For that echo command, use 'set' commands to suppress the + # shell's execution trace, which would just add noise. Suppress + # the shell's execution trace for the 'set' commands by + # redirecting their stderr to /dev/null. + if command: + msg = f"'{dbg}': {shlex.quote(command.lstrip())}" + else: + msg = f"'{dbg}' has no command after substitutions" + commands[i] = ( + f"{{ set +x; }} 2>/dev/null && " + f"echo {msg} >&2 && " + f"{{ set -x; }} 2>/dev/null" + ) + # Execute the command, if any. + # + # 'command' might be something like: + # + # subcmd & PID=$! + # + # In that case, we need something like: + # + # echo_dbg && { subcmd & PID=$!; } + # + # Without the '{ ...; }' enclosing the original 'command', '&' + # would put all of 'echo_dbg && subcmd' in the background. This + # would cause 'echo_dbg' to execute at the wrong time, and a + # later kill of $PID would target the wrong process. We have + # seen the latter manage to terminate the shell running lit. 
+ if command: + commands[i] += f" && {{ {command}; }}" if test.config.pipefail: f.write(b"set -o pipefail;" if mode == "wb" else "set -o pipefail;") - if litConfig.echo_all_commands: - f.write(b"set -x;" if mode == "wb" else "set -x;") + f.write(b"set -x;" if mode == "wb" else "set -x;") if sys.version_info > (3, 0) and mode == "wb": f.write(bytes("{ " + "; } &&\n{ ".join(commands) + "; }", "utf-8")) else: @@ -2035,6 +2132,11 @@ def parseIntegratedTestScript(test, additional_parsers=[], require_script=True): def _runShTest(test, litConfig, useExternalSh, script, tmpBase): def runOnce(execdir): + # script is modified below (for litConfig.per_test_coverage, and for + # %dbg expansions). runOnce can be called multiple times, but applying + # the modifications multiple times can corrupt script, so always modify + # a copy. + scriptCopy = script[:] # Set unique LLVM_PROFILE_FILE for each run command if litConfig.per_test_coverage: # Extract the test case name from the test object, and remove the @@ -2042,7 +2144,7 @@ def runOnce(execdir): test_case_name = test.path_in_suite[-1] test_case_name = test_case_name.rsplit(".", 1)[0] coverage_index = 0 # Counter for coverage file index - for i, ln in enumerate(script): + for i, ln in enumerate(scriptCopy): match = re.fullmatch(kPdbgRegex, ln) if match: dbg = match.group(1) @@ -2054,12 +2156,12 @@ def runOnce(execdir): command = f"export LLVM_PROFILE_FILE={profile}; {command}" if match: command = buildPdbgCommand(dbg, command) - script[i] = command + scriptCopy[i] = command if useExternalSh: - res = executeScript(test, litConfig, tmpBase, script, execdir) + res = executeScript(test, litConfig, tmpBase, scriptCopy, execdir) else: - res = executeScriptInternal(test, litConfig, tmpBase, script, execdir) + res = executeScriptInternal(test, litConfig, tmpBase, scriptCopy, execdir) if isinstance(res, lit.Test.Result): return res @@ -2079,14 +2181,7 @@ def runOnce(execdir): # Re-run failed tests up to test.allowed_retries times. execdir = os.path.dirname(test.getExecPath()) attempts = test.allowed_retries + 1 - scriptInit = script for i in range(attempts): - # runOnce modifies script, but applying the modifications again to the - # result can corrupt script, so we restore the original upon a retry. - # A cleaner solution would be for runOnce to encapsulate operating on a - # copy of script, but we actually want it to modify the original script - # so we can print the modified version under "Script:" below. - script = scriptInit[:] res = runOnce(execdir) if isinstance(res, lit.Test.Result): return res @@ -2101,7 +2196,7 @@ def runOnce(execdir): status = Test.FLAKYPASS # Form the output log. 
- output = """Script:\n--\n%s\n--\nExit Code: %d\n""" % ("\n".join(script), exitCode) + output = f"Exit Code: {exitCode}\n" if timeoutInfo is not None: output += """Timeout: %s\n""" % (timeoutInfo,) @@ -2123,6 +2218,8 @@ def executeShTest( return lit.Test.Result(Test.UNSUPPORTED, "Test is unsupported") script = list(preamble_commands) + script = [buildPdbgCommand(f"preamble command line", ln) for ln in script] + parsed = parseIntegratedTestScript(test, require_script=not script) if isinstance(parsed, lit.Test.Result): return parsed diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py index 747824574dd67..132476fb2a367 100644 --- a/llvm/utils/lit/lit/cl_arguments.py +++ b/llvm/utils/lit/lit/cl_arguments.py @@ -72,22 +72,23 @@ def parse_args(): "-v", "--verbose", dest="showOutput", - help="Show test output for failures", + help="For failed tests, show all output. For example, each command is" + " printed before it is executed, so the last printed command is the one" + " that failed.", action="store_true", ) format_group.add_argument( "-vv", "--echo-all-commands", - dest="echoAllCommands", + dest="showOutput", + help="Deprecated alias for -v.", action="store_true", - help="Echo all commands as they are executed to stdout. In case of " - "failure, last command shown will be the failing one.", ) format_group.add_argument( "-a", "--show-all", dest="showAllOutput", - help="Display all commandlines and output", + help="Enable -v, but for all tests not just failed tests.", action="store_true", ) format_group.add_argument( @@ -299,9 +300,6 @@ def parse_args(): opts = parser.parse_args(args) # Validate command line options - if opts.echoAllCommands: - opts.showOutput = True - if opts.incremental: print( "WARNING: --incremental is deprecated. Failing tests now always run first." 
diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py index 6858961752a66..3cb47c605ad5a 100755 --- a/llvm/utils/lit/lit/main.py +++ b/llvm/utils/lit/lit/main.py @@ -40,7 +40,6 @@ def main(builtin_params={}): order=opts.order, params=params, config_prefix=opts.configPrefix, - echo_all_commands=opts.echoAllCommands, per_test_coverage=opts.per_test_coverage, ) diff --git a/llvm/utils/lit/tests/Inputs/lit-opts/lit.cfg b/llvm/utils/lit/tests/Inputs/lit-opts/lit.cfg index cf1a4f1ba1ec7..301208c25bdc7 100644 --- a/llvm/utils/lit/tests/Inputs/lit-opts/lit.cfg +++ b/llvm/utils/lit/tests/Inputs/lit-opts/lit.cfg @@ -5,4 +5,4 @@ config.suffixes = [".txt"] config.test_format = lit.formats.ShTest() config.test_source_root = None config.test_exec_root = None -config.substitutions.append(("%var", lit_config.params.get("var", ""))) +config.substitutions.append(("%var", lit_config.params.get("var", "default"))) diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/examples/param-subst.txt b/llvm/utils/lit/tests/Inputs/shtest-define/examples/param-subst.txt index 6dd9b16304e05..1d94c12137234 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-define/examples/param-subst.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-define/examples/param-subst.txt @@ -14,21 +14,21 @@ ; REDEFINE: %{cflags} = -triple x86_64-apple-darwin10.6.0 -fopenmp-simd ; REDEFINE: %{fcflags} = -check-prefix=SIMD ; RUN: %{check} -; CHECK: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -triple x86_64-apple-darwin10.6.0 -fopenmp-simd -emit-llvm -o - {{.*}} | FileCheck -check-prefix=SIMD {{.*}} +; CHECK:# | %clang_cc1 -verify -fopenmp -fopenmp-version=51 -triple x86_64-apple-darwin10.6.0 -fopenmp-simd -emit-llvm -o - {{.*}} | FileCheck -check-prefix=SIMD {{.*}} ; REDEFINE: %{cflags} = -triple x86_64-unknown-linux-gnu -fopenmp-simd ; REDEFINE: %{fcflags} = -check-prefix=SIMD ; RUN: %{check} -; CHECK: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -triple x86_64-unknown-linux-gnu -fopenmp-simd -emit-llvm -o - {{.*}} | FileCheck -check-prefix=SIMD {{.*}} +; CHECK:# | %clang_cc1 -verify -fopenmp -fopenmp-version=51 -triple x86_64-unknown-linux-gnu -fopenmp-simd -emit-llvm -o - {{.*}} | FileCheck -check-prefix=SIMD {{.*}} ; REDEFINE: %{cflags} = -triple x86_64-apple-darwin10.6.0 ; REDEFINE: %{fcflags} = -check-prefix=NO-SIMD ; RUN: %{check} -; CHECK: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -triple x86_64-apple-darwin10.6.0 -emit-llvm -o - {{.*}} | FileCheck -check-prefix=NO-SIMD {{.*}} +; CHECK:# | %clang_cc1 -verify -fopenmp -fopenmp-version=51 -triple x86_64-apple-darwin10.6.0 -emit-llvm -o - {{.*}} | FileCheck -check-prefix=NO-SIMD {{.*}} ; REDEFINE: %{cflags} = -triple x86_64-unknown-linux-gnu ; REDEFINE: %{fcflags} = -check-prefix=NO-SIMD ; RUN: %{check} -; CHECK: %clang_cc1 -verify -fopenmp -fopenmp-version=51 -triple x86_64-unknown-linux-gnu -emit-llvm -o - {{.*}} | FileCheck -check-prefix=NO-SIMD {{.*}} +; CHECK:# | %clang_cc1 -verify -fopenmp -fopenmp-version=51 -triple x86_64-unknown-linux-gnu -emit-llvm -o - {{.*}} | FileCheck -check-prefix=NO-SIMD {{.*}} ; CHECK: Passed: 1 diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/expansion-order.txt b/llvm/utils/lit/tests/Inputs/shtest-define/expansion-order.txt index 3bf057151afb7..23c0a6d507533 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-define/expansion-order.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-define/expansion-order.txt @@ -7,7 +7,7 @@ # # REDEFINE: %{global:greeting}=Hello # RUN: %{global:echo} -# CHECK: GLOBAL: Hello World +# CHECK:# | GLOBAL: Hello 
World
 
 # We can redefine the test suite config's substitutions multiple times. Again,
 # the expansion order remains the same (%{global:echo} before %{global:greeting}
@@ -17,7 +17,7 @@
 # REDEFINE: %{global:greeting}=Goodbye %{global:what}
 # REDEFINE: %{global:what}=Sleep
 # RUN: %{global:echo}
-# CHECK: GLOBAL: Goodbye Sleep Sleep
+# CHECK:# | GLOBAL: Goodbye Sleep Sleep
 
 # A new local substitution is prepended to the substitution list so that it can
 # depend on all substitutions that were defined previously, including those from
@@ -26,7 +26,7 @@
 # DEFINE: %{local:greeting}=Hey %{global:what}
 # DEFINE: %{local:echo}=echo "LOCAL: %{local:greeting} %{global:what}"
 # RUN: %{local:echo}
-# CHECK: LOCAL: Hey Sleep Sleep
+# CHECK:# | LOCAL: Hey Sleep Sleep
 
 # As for substitutions from the test suite config, redefining local
 # substitutions should not change the expansion order. Again, the expansion
@@ -36,6 +36,6 @@
 # REDEFINE: %{local:greeting}=So Long %{global:what}
 # REDEFINE: %{global:what}=World
 # RUN: %{local:echo}
-# CHECK: LOCAL: So Long World World
+# CHECK:# | LOCAL: So Long World World
 
 # CHECK: Passed: 1
diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/line-number-substitutions.txt b/llvm/utils/lit/tests/Inputs/shtest-define/line-number-substitutions.txt
index 65f90792ff7bc..5a1d7f2e19877 100644
--- a/llvm/utils/lit/tests/Inputs/shtest-define/line-number-substitutions.txt
+++ b/llvm/utils/lit/tests/Inputs/shtest-define/line-number-substitutions.txt
@@ -1,19 +1,19 @@
 # Does it work as expected directly in RUN lines?
 # RUN: echo %(line), %(line-1), %(line+2)
-# CHECK: [[#@LINE-1]], [[#@LINE-2]], [[#@LINE+1]]
+# CHECK:# | [[#@LINE-1]], [[#@LINE-2]], [[#@LINE+1]]
 
 # %(line) substitutions refer to the original DEFINE/REDEFINE line not the RUN
 # line they eventually appear within.
 #
 # DEFINE: %{lines} = %(line)
 # RUN: echo '%{lines}'
-# CHECK: [[#@LINE-2]]
+# CHECK:# | [[#@LINE-2]]
 #
 # REDEFINE: %{lines} = %(line), \
 # REDEFINE: %(line), \
 # REDEFINE: %(line)
 # RUN: echo '%{lines}'
-# CHECK: [[#@LINE-4]], [[#@LINE-3]], [[#@LINE-2]]
+# CHECK:# | [[#@LINE-4]], [[#@LINE-3]], [[#@LINE-2]]
 
 # %(line+N) and %(line-N) should work too. 
# @@ -21,12 +21,12 @@ # DEFINE: %(line), \ # DEFINE: %(line-1) # RUN: echo '%{lines-rel}' -# CHECK: [[#@LINE-3]], [[#@LINE-3]], [[#@LINE-3]] +# CHECK:# | [[#@LINE-3]], [[#@LINE-3]], [[#@LINE-3]] # # REDEFINE: %{lines-rel} = %(line+5), \ # REDEFINE: %(line+0), \ # REDEFINE: %(line-10) # RUN: echo '%{lines-rel}' -# CHECK: [[#@LINE+1]], [[#@LINE-3]], [[#@LINE-12]] +# CHECK:# | [[#@LINE+1]], [[#@LINE-3]], [[#@LINE-12]] # CHECK: Passed: 1 diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/name-chars.txt b/llvm/utils/lit/tests/Inputs/shtest-define/name-chars.txt index 18a27cdd72fa6..d27fda2e2fe6e 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-define/name-chars.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-define/name-chars.txt @@ -1,25 +1,25 @@ # DEFINE: %{abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789} = ok # RUN: echo '%{abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789}' -# CHECK: ok +# CHECK:# | ok # DEFINE: %{FooBar} = ok at %(line) # RUN: echo '%{FooBar}' -# CHECK: ok at [[#@LINE - 2]] +# CHECK:# | ok at [[#@LINE - 2]] # DEFINE: %{fooBar} = ok at %(line) # RUN: echo '%{fooBar}' -# CHECK: ok at [[#@LINE - 2]] +# CHECK:# | ok at [[#@LINE - 2]] # DEFINE: %{foo-bar-} = ok at %(line) # RUN: echo '%{foo-bar-}' -# CHECK: ok at [[#@LINE - 2]] +# CHECK:# | ok at [[#@LINE - 2]] # DEFINE: %{foo:bar:} = ok at %(line) # RUN: echo '%{foo:bar:}' -# CHECK: ok at [[#@LINE - 2]] +# CHECK:# | ok at [[#@LINE - 2]] # DEFINE: %{_foo_bar_} = ok at %(line) # RUN: echo '%{_foo_bar_}' -# CHECK: ok at [[#@LINE - 2]] +# CHECK:# | ok at [[#@LINE - 2]] # CHECK: Passed: 1 diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/recursiveExpansionLimit.txt b/llvm/utils/lit/tests/Inputs/shtest-define/recursiveExpansionLimit.txt index eb5f0b918fd19..e504b822ace81 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-define/recursiveExpansionLimit.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-define/recursiveExpansionLimit.txt @@ -6,7 +6,7 @@ # RUN: echo '%{outer}' -# CHECK-NON-RECUR:%{inner} -# CHECK-RECUR:expanded +# CHECK-NON-RECUR:# | %{inner} +# CHECK-RECUR:# | expanded # CHECK: Passed: 1 diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/value-equals.txt b/llvm/utils/lit/tests/Inputs/shtest-define/value-equals.txt index 9d2e7197fb3a6..c1bdfef30b1a0 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-define/value-equals.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-define/value-equals.txt @@ -2,21 +2,21 @@ # DEFINE: %{equals} = FileCheck -check-prefixes=FOO,BAR # RUN: echo '%{equals}' -# CHECK: FileCheck -check-prefixes=FOO,BAR +# CHECK:# | FileCheck -check-prefixes=FOO,BAR # # REDEFINE: %{equals} == == = # RUN: echo '%{equals}' -# CHECK: = == = +# CHECK:# | = == = # DEFINE: %{continue-equals} = FileCheck -strict-whitespace -match-full-lines \ # DEFINE: -check-prefixes=FOO,BAR # RUN: echo '%{continue-equals}' -# CHECK: FileCheck -strict-whitespace -match-full-lines -check-prefixes=FOO,BAR +# CHECK:# | FileCheck -strict-whitespace -match-full-lines -check-prefixes=FOO,BAR # # REDEFINE: %{continue-equals} = FileCheck -input-file=test.txt \ # REDEFINE: -implicit-check-not=foobar \ # REDEFINE: -check-prefixes=FOO,BAR # RUN: echo '%{continue-equals}' -# CHECK: FileCheck -input-file=test.txt -implicit-check-not=foobar -check-prefixes=FOO,BAR +# CHECK:# | FileCheck -input-file=test.txt -implicit-check-not=foobar -check-prefixes=FOO,BAR # CHECK: Passed: 1 diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/value-escaped.txt b/llvm/utils/lit/tests/Inputs/shtest-define/value-escaped.txt index 
58d5c1a34ce89..9143796edd32d 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-define/value-escaped.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-define/value-escaped.txt @@ -6,11 +6,11 @@ # DEFINE: %{escape} = \g<0>\n # RUN: echo '%{escape}' -# CHECK: {{\\?}}\g<0>{{\\?}}\n +# CHECK:# | {{\\?}}\g<0>{{\\?}}\n # REDEFINE: %{escape} = \n \ # REDEFINE: \g # RUN: echo '%{escape}' -# CHECK: {{\\?}}\n {{\\?}}\g +# CHECK:# | {{\\?}}\n {{\\?}}\g # CHECK: Passed: 1 diff --git a/llvm/utils/lit/tests/Inputs/shtest-define/ws-and-continuations.txt b/llvm/utils/lit/tests/Inputs/shtest-define/ws-and-continuations.txt index 1259e511ba701..3e4db1b15d927 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-define/ws-and-continuations.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-define/ws-and-continuations.txt @@ -3,33 +3,33 @@ # # DEFINE: %{empty}= # RUN: echo "'%{empty}'" -# CHECK:'' +# CHECK:# | '' # # REDEFINE: %{empty}= # RUN: echo "'%{empty}'" -# CHECK:'' +# CHECK:# | '' # A value consisting only of whitespace is trimmed to the empty string. # # v~~ intentional whitespace # DEFINE: %{ws}= # RUN: echo "'%{ws}'" -# CHECK:'' +# CHECK:# | '' # # v intentional whitespace # REDEFINE: %{ws}= # RUN: echo "'%{ws}'" -# CHECK:'' +# CHECK:# | '' # Whitespace is not required around the name or value. # # DEFINE:%{no-whitespace}=abc # RUN: echo "'%{no-whitespace}'" -# CHECK:'abc' +# CHECK:# | 'abc' # # REDEFINE:%{no-whitespace}=HelloWorld # RUN: echo "'%{no-whitespace}'" -# CHECK:'HelloWorld' +# CHECK:# | 'HelloWorld' # Whitespace is not required between substitutions in a value. # @@ -37,11 +37,11 @@ # DEFINE: %{adjacent1} = bar # DEFINE: %{has-adjacent-substs} = %{adjacent0}%{adjacent1} # RUN: echo "'%{has-adjacent-substs}'" -# CHECK:'foobar' +# CHECK:# | 'foobar' # # REDEFINE: %{has-adjacent-substs} = %{adjacent0}%{adjacent1}%{adjacent0} # RUN: echo "'%{has-adjacent-substs}'" -# CHECK:'foobarfoo' +# CHECK:# | 'foobarfoo' # Exact whitespace is preserved within the value, but whitespace enclosing the # name or value is discarded. ('%{' and '}' are part of the name, and @@ -50,11 +50,11 @@ # v~~ intentional whitespace # DEFINE: %{whitespace} = abc def # RUN: echo "'%{whitespace}'" -# CHECK:'abc def' +# CHECK:# | 'abc def' # v intentional whitespace # REDEFINE: %{whitespace} = Hello World # RUN: echo "'%{whitespace}'" -# CHECK:'Hello World' +# CHECK:# | 'Hello World' # Line continuations in the value are permitted and collapse whitespace. # @@ -66,12 +66,12 @@ # DEFINE: pqr # ^ intentional whitespace # RUN: echo "'%{continue}'" -# CHECK:'abc def ghi jkl mno pqr' +# CHECK:# | 'abc def ghi jkl mno pqr' # # REDEFINE: %{continue} = abc \ # REDEFINE: def # RUN: echo "'%{continue}'" -# CHECK:'abc def' +# CHECK:# | 'abc def' # Whitespace at the end of the line after a '\' is ignored, and it's treated as # a line continuation. Otherwise, the behavior would be hard to understand @@ -83,7 +83,7 @@ # ^ intentional whitespace # DEFINE: baz # RUN: echo "'%{ws-after-continue}'" -# CHECK:'foo bar baz' +# CHECK:# | 'foo bar baz' # # v intentional whitespace # REDEFINE: %{ws-after-continue}=foo \ @@ -91,7 +91,7 @@ # ^~~~~~~~~~~~ intentional whitespace # REDEFINE: baz # RUN: echo "'%{ws-after-continue}'" -# CHECK:'foo bar baz' +# CHECK:# | 'foo bar baz' # A line continuation is recognized anywhere. It should be used only where # whitespace is permitted because it reduces to a single space. 
@@ -107,7 +107,7 @@ # DEFINE:\ # DEFINE:a # RUN: echo "'%{blank-lines}'" -# CHECK:'a' +# CHECK:# | 'a' # # REDEFINE: \ # REDEFINE: %{blank-lines} \ @@ -120,7 +120,7 @@ # REDEFINE: \ # REDEFINE: c # RUN: echo "'%{blank-lines}'" -# CHECK:'a b c' +# CHECK:# | 'a b c' # The fourth DEFINE line is deceptive because it looks like a new substitution, # but it's actually a continuation of the previous value. @@ -130,6 +130,6 @@ # DEFINE: %{deceptive-continue}=echo \ # DEFINE: %{name}=%{value} # RUN: %{deceptive-continue} -# CHECK:x=3 +# CHECK:# | x=3 # CHECK:{{ *}}Passed: 1 diff --git a/llvm/utils/lit/tests/Inputs/shtest-external-shell-kill/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-external-shell-kill/lit.cfg new file mode 100644 index 0000000000000..d10594dc525f5 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-external-shell-kill/lit.cfg @@ -0,0 +1,5 @@ +import lit.formats + +config.test_format = lit.formats.ShTest(execute_external=True) +config.name = "shtest-external-shell-kill" +config.suffixes = [".txt"] diff --git a/llvm/utils/lit/tests/Inputs/shtest-external-shell-kill/test.txt b/llvm/utils/lit/tests/Inputs/shtest-external-shell-kill/test.txt new file mode 100644 index 0000000000000..dbdf2d689425c --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-external-shell-kill/test.txt @@ -0,0 +1,5 @@ +# RUN: echo start +# RUN: sleep 300 & PID=$! +# RUN: sleep 2 +# RUN: kill $PID +# RUN: echo end diff --git a/llvm/utils/lit/tests/Inputs/shtest-if-else/test.txt b/llvm/utils/lit/tests/Inputs/shtest-if-else/test.txt index 805a74de3a7ee..b5fc1b49fcc52 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-if-else/test.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-if-else/test.txt @@ -1,29 +1,34 @@ # CHECK: -- Testing:{{.*}} # CHECK-NEXT: PASS: shtest-if-else :: test.txt (1 of 1) -# CHECK-NEXT: Script: +# CHECK: Command Output (stdout): # CHECK-NEXT: -- # RUN: %if feature %{ echo "test-1" %} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-1]]'; echo "test-1" +# CHECK: # {{RUN}}: at line [[#@LINE-1]] +# CHECK-NEXT: echo "test-1" # If %else is not present it is treated like %else %{%}. Empty commands # are ignored. 
# # RUN: %if nofeature %{ echo "fail" %} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-1]]' +# CHECK: # {{RUN}}: at line [[#@LINE-1]] has no command after substitutions # CHECK-NOT: fail # RUN: %if nofeature %{ echo "fail" %} %else %{ echo "test-2" %} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-1]]'; echo "test-2" +# CHECK: # {{RUN}}: at line [[#@LINE-1]] +# CHECK-NEXT: echo "test-2" # Spaces inside curly braces are not ignored # # RUN: echo test-%if feature %{ 3 %} %else %{ fail %}-test # RUN: echo test-%if feature %{ 4 4 %} %else %{ fail %}-test # RUN: echo test-%if nofeature %{ fail %} %else %{ 5 5 %}-test -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-3]]'; echo test- 3 -test -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-3]]'; echo test- 4 4 -test -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-3]]'; echo test- 5 5 -test +# CHECK: # {{RUN}}: at line [[#@LINE-3]] +# CHECK-NEXT: echo test- 3 -test +# CHECK: # {{RUN}}: at line [[#@LINE-4]] +# CHECK-NEXT: echo test- 4 4 -test +# CHECK: # {{RUN}}: at line [[#@LINE-5]] +# CHECK-NEXT: echo test- 5 5 -test # Escape line breaks for multi-line expressions # @@ -31,27 +36,32 @@ # RUN: %{ echo \ # RUN: "test-5" \ # RUN: %} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-4]]'; echo "test-5" +# CHECK: # {{RUN}}: at line [[#@LINE-4]] +# CHECK-NEXT: echo "test-5" # RUN: %if nofeature \ # RUN: %{ echo "fail" %} \ # RUN: %else \ # RUN: %{ echo "test-6" %} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-4]]'; echo "test-6" +# CHECK: # {{RUN}}: at line [[#@LINE-4]] +# CHECK-NEXT: echo "test-6" # RUN: echo "test%if feature %{%} %else %{%}-7" -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-1]]'; echo "test-7" +# CHECK: # {{RUN}}: at line [[#@LINE-1]] +# CHECK-NEXT: echo "test-7" # Escape %if. Without %if..%else context '%{' and '%}' are treated # literally. # # RUN: echo %%if feature %{ echo "test-8" %} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-1]]'; echo %if feature %{ echo "test-8" %} +# CHECK: # {{RUN}}: at line [[#@LINE-1]] +# CHECK-NEXT: echo %if feature %{ echo "test-8" %} # Nested expressions are supported: # # RUN: echo %if feature %{ %if feature %{ %if nofeature %{"fail"%} %else %{"test-9"%} %} %} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-1]]'; echo "test-9" +# CHECK: # {{RUN}}: at line [[#@LINE-1]] +# CHECK-NEXT: echo "test-9" # Binary expression evaluation and regex match can be used as # conditions. @@ -59,9 +69,12 @@ # RUN: echo %if feature && !nofeature %{ "test-10" %} # RUN: echo %if feature && nofeature %{ "fail" %} %else %{ "test-11" %} # RUN: echo %if {{fea.+}} %{ "test-12" %} %else %{ "fail" %} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-3]]'; echo "test-10" -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-3]]'; echo "test-11" -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-3]]'; echo "test-12" +# CHECK: # {{RUN}}: at line [[#@LINE-3]] +# CHECK-NEXT: echo "test-10" +# CHECK: # {{RUN}}: at line [[#@LINE-4]] +# CHECK-NEXT: echo "test-11" +# CHECK: # {{RUN}}: at line [[#@LINE-5]] +# CHECK-NEXT: echo "test-12" # Spaces between %if and %else are ignored. If there is no %else - # space after %if %{...%} is not ignored. 
@@ -69,24 +82,27 @@ # RUN: echo XX %if feature %{YY%} ZZ # RUN: echo AA %if feature %{BB%} %else %{CC%} DD # RUN: echo AA %if nofeature %{BB%} %else %{CC%} DD -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-3]]'; echo XX YY ZZ -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-3]]'; echo AA BB DD -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-3]]'; echo AA CC DD +# CHECK: # {{RUN}}: at line [[#@LINE-3]] +# CHECK-NEXT: echo XX YY ZZ +# CHECK: # {{RUN}}: at line [[#@LINE-4]] +# CHECK-NEXT: echo AA BB DD +# CHECK: # {{RUN}}: at line [[#@LINE-5]] +# CHECK-NEXT: echo AA CC DD # '{' and '}' can be used without escaping # # RUN: %if feature %{echo {}%} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-1]]'; echo {} +# CHECK: # {{RUN}}: at line [[#@LINE-1]] +# CHECK-NEXT: echo {} # Spaces are not required # # RUN: echo %if feature%{"ok"%}%else%{"fail"%} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-1]]'; echo "ok" +# CHECK: # {{RUN}}: at line [[#@LINE-1]] +# CHECK-NEXT: echo "ok" # Substitutions with braces are handled correctly # # RUN: echo %{sub} %if feature%{test-%{sub}%}%else%{"fail"%} -# CHECK-NEXT: {{^.*'RUN}}: at line [[#@LINE-1]]'; echo ok test-ok - -# CHECK-NEXT: -- -# CHECK-NEXT: Exit Code: 0 +# CHECK: # {{RUN}}: at line [[#@LINE-1]] +# CHECK-NEXT: echo ok test-ok diff --git a/llvm/utils/lit/tests/Inputs/shtest-output-printing/basic.txt b/llvm/utils/lit/tests/Inputs/shtest-output-printing/basic.txt index fb3a3e0007891..5ff0c891450bb 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-output-printing/basic.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-output-printing/basic.txt @@ -1,3 +1,4 @@ # RUN: true # RUN: echo hi -# RUN: not not wc missing-file &> %t.out +# RUN: not not wc missing-file &> %t.out || true +# RUN: not %{python} %S/write-a-lot.py &> %t.out diff --git a/llvm/utils/lit/tests/Inputs/shtest-output-printing/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-output-printing/lit.cfg index b872854d21e63..141fc3a596e77 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-output-printing/lit.cfg +++ b/llvm/utils/lit/tests/Inputs/shtest-output-printing/lit.cfg @@ -2,4 +2,5 @@ import lit.formats config.name = "shtest-output-printing" config.suffixes = [".txt"] +config.substitutions.append(("%{python}", f'"{sys.executable}"')) config.test_format = lit.formats.ShTest(execute_external=False) diff --git a/llvm/utils/lit/tests/Inputs/shtest-output-printing/write-a-lot.py b/llvm/utils/lit/tests/Inputs/shtest-output-printing/write-a-lot.py new file mode 100644 index 0000000000000..5dbb94a41cbfd --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-output-printing/write-a-lot.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python + +import sys + +sys.stdout.write("All work and no play makes Jack a dull boy.\n" * 1000) +sys.stdout.flush() diff --git a/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/empty-run-line.txt b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/empty-run-line.txt new file mode 100644 index 0000000000000..40a5a7d6e7cce --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/external-shell/empty-run-line.txt @@ -0,0 +1,3 @@ +# DEFINE: %{empty} = +# RUN: %{empty} +# RUN: false diff --git a/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/empty-run-line.txt b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/empty-run-line.txt new file mode 100644 index 0000000000000..40a5a7d6e7cce --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-run-at-line/internal-shell/empty-run-line.txt @@ -0,0 +1,3 @@ +# DEFINE: %{empty} = +# RUN: %{empty} +# RUN: false diff 
--git a/llvm/utils/lit/tests/Inputs/shtest-shell/echo-at-redirect-stderr.txt b/llvm/utils/lit/tests/Inputs/shtest-shell/echo-at-redirect-stderr.txt new file mode 100644 index 0000000000000..15a87aee46a37 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/echo-at-redirect-stderr.txt @@ -0,0 +1 @@ +RUN: @echo 2> %t diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/echo-at-redirect-stdin.txt b/llvm/utils/lit/tests/Inputs/shtest-shell/echo-at-redirect-stdin.txt new file mode 100644 index 0000000000000..27fd0c4209fd6 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/echo-at-redirect-stdin.txt @@ -0,0 +1,2 @@ +RUN: touch %t +RUN: @echo < %t diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/echo-redirect-stderr.txt b/llvm/utils/lit/tests/Inputs/shtest-shell/echo-redirect-stderr.txt new file mode 100644 index 0000000000000..9611918f7e124 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/echo-redirect-stderr.txt @@ -0,0 +1 @@ +RUN: echo 2> %t diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/echo-redirect-stdin.txt b/llvm/utils/lit/tests/Inputs/shtest-shell/echo-redirect-stdin.txt new file mode 100644 index 0000000000000..bc771be6b22a6 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/echo-redirect-stdin.txt @@ -0,0 +1,2 @@ +RUN: touch %t +RUN: echo < %t diff --git a/llvm/utils/lit/tests/allow-retries.py b/llvm/utils/lit/tests/allow-retries.py index bf6c041933887..45610fb70d348 100644 --- a/llvm/utils/lit/tests/allow-retries.py +++ b/llvm/utils/lit/tests/allow-retries.py @@ -15,9 +15,28 @@ # This test does not succeed within the allowed retry limit # -# RUN: not %{lit} %{inputs}/allow-retries/does-not-succeed-within-limit.py | FileCheck --check-prefix=CHECK-TEST3 %s -# CHECK-TEST3: Failed Tests (1): -# CHECK-TEST3: allow-retries :: does-not-succeed-within-limit.py +# Check that the execution trace isn't corrupt due to reprocessing the script +# multiple times (e.g., '%dbg(...)' processing used to accumulate across +# retries). +# +# RUN: not %{lit} %{inputs}/allow-retries/does-not-succeed-within-limit.py -v |\ +# RUN: FileCheck --check-prefix=CHECK-TEST3 -match-full-lines %s +# +# CHECK-TEST3: FAIL: allow-retries :: does-not-succeed-within-limit.py (1 of 1) +# CHECK-TEST3-NEXT: {{\**}} TEST 'allow-retries :: does-not-succeed-within-limit.py' FAILED {{\**}} +# CHECK-TEST3-NEXT: Exit Code: 1 +# CHECK-TEST3-EMPTY: +# CHECK-TEST3-NEXT: Command Output (stdout): +# CHECK-TEST3-NEXT: -- +# CHECK-TEST3-NEXT: # {{RUN}}: at line 3 +# CHECK-TEST3-NEXT: false +# CHECK-TEST3-NEXT: # executed command: false +# CHECK-TEST3-NEXT: # note: command had no output on stdout or stderr +# CHECK-TEST3-NEXT: # error: command failed with exit status: 1 +# CHECK-TEST3-EMPTY: +# CHECK-TEST3-NEXT: -- +# CHECK-TEST3: Failed Tests (1): +# CHECK-TEST3: allow-retries :: does-not-succeed-within-limit.py # This test should be UNRESOLVED since it has more than one ALLOW_RETRIES # lines, and that is not allowed. 
@@ -48,6 +67,6 @@ # RUN: -Dcounter=%t.counter -Dpython=%{python} | \ # RUN: FileCheck --check-prefix=CHECK-TEST7 %s # CHECK-TEST7: Command Output (stdout): -# CHECK-TEST7: LLVM_PROFILE_FILE= -# CHECK-TEST7-NOT: LLVM_PROFILE_FILE= +# CHECK-TEST7: # executed command: export LLVM_PROFILE_FILE= +# CHECK-TEST7-NOT: # executed command: export LLVM_PROFILE_FILE= # CHECK-TEST7: Passed With Retry: 1 diff --git a/llvm/utils/lit/tests/lit-opts.py b/llvm/utils/lit/tests/lit-opts.py index d292ca74f3b9b..a533a59d9d124 100644 --- a/llvm/utils/lit/tests/lit-opts.py +++ b/llvm/utils/lit/tests/lit-opts.py @@ -8,7 +8,7 @@ # # RUN: env LIT_OPTS=-a \ # RUN: %{lit} -s %{inputs}/lit-opts \ -# RUN: | FileCheck -check-prefix=SHOW-ALL -DVAR= %s +# RUN: | FileCheck -check-prefix=SHOW-ALL -DVAR=default %s # Check that LIT_OPTS understands multiple options with arbitrary spacing. # @@ -28,6 +28,6 @@ # SHOW-ALL: Testing: 1 tests # SHOW-ALL: PASS: lit-opts :: test.txt (1 of 1) -# SHOW-ALL: {{^}}[[VAR]] +# SHOW-ALL: echo [[VAR]] # SHOW-ALL-NOT: PASS # SHOW-ALL: Passed: 1 diff --git a/llvm/utils/lit/tests/shtest-define.py b/llvm/utils/lit/tests/shtest-define.py index 1d0997bbbaf80..8c9309804ccea 100644 --- a/llvm/utils/lit/tests/shtest-define.py +++ b/llvm/utils/lit/tests/shtest-define.py @@ -136,15 +136,15 @@ # RUN: %{lit} -va %{my-inputs}/shared-substs-*.txt 2>&1 | \ # RUN: FileCheck -check-prefix=SHARED-SUBSTS -match-full-lines %s # -# SHARED-SUBSTS: shared-substs-0.txt -# SHARED-SUBSTS: GLOBAL: World -# SHARED-SUBSTS: LOCAL0: LOCAL0:Hello LOCAL0:World -# SHARED-SUBSTS: LOCAL0: subst +# SHARED-SUBSTS:# | shared-substs-0.txt +# SHARED-SUBSTS:# | GLOBAL: World +# SHARED-SUBSTS:# | LOCAL0: LOCAL0:Hello LOCAL0:World +# SHARED-SUBSTS:# | LOCAL0: subst # -# SHARED-SUBSTS: shared-substs-1.txt -# SHARED-SUBSTS: GLOBAL: World -# SHARED-SUBSTS: LOCAL1: LOCAL1:Hello LOCAL1:World -# SHARED-SUBSTS: LOCAL1: subst +# SHARED-SUBSTS:# | shared-substs-1.txt +# SHARED-SUBSTS:# | GLOBAL: World +# SHARED-SUBSTS:# | LOCAL1: LOCAL1:Hello LOCAL1:World +# SHARED-SUBSTS:# | LOCAL1: subst # # REDEFINE: %{test} = shared-substs-0.txt # RUN: %{record-test} diff --git a/llvm/utils/lit/tests/shtest-env.py b/llvm/utils/lit/tests/shtest-env.py index f2e8216f7f4a7..c093b62e5420f 100644 --- a/llvm/utils/lit/tests/shtest-env.py +++ b/llvm/utils/lit/tests/shtest-env.py @@ -10,88 +10,115 @@ # CHECK: -- Testing: 16 tests{{.*}} # CHECK: FAIL: shtest-env :: env-args-last-is-assign.txt ({{[^)]*}}) -# CHECK: $ "env" "FOO=1" -# CHECK: Error: 'env' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env FOO=1 +# CHECK: # executed command: env FOO=1 +# CHECK: # | Error: 'env' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-args-last-is-u-arg.txt ({{[^)]*}}) -# CHECK: $ "env" "-u" "FOO" -# CHECK: Error: 'env' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env -u FOO +# CHECK: # executed command: env -u FOO +# CHECK: # | Error: 'env' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-args-last-is-u.txt ({{[^)]*}}) -# CHECK: $ "env" "-u" -# CHECK: Error: 'env' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env -u +# CHECK: # executed command: env -u +# CHECK: # | Error: 'env' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-args-nested-none.txt ({{[^)]*}}) -# 
CHECK: $ "env" "env" "env" -# CHECK: Error: 'env' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env env env +# CHECK: # executed command: env env env +# CHECK: # | Error: 'env' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-args-none.txt ({{[^)]*}}) -# CHECK: $ "env" -# CHECK: Error: 'env' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env +# CHECK: # executed command: env +# CHECK: # | Error: 'env' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-calls-cd.txt ({{[^)]*}}) -# CHECK: $ "env" "-u" "FOO" "BAR=3" "cd" "foobar" -# CHECK: Error: 'env' cannot call 'cd' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env -u FOO BAR=3 cd foobar +# CHECK: # executed command: env -u FOO BAR=3 cd foobar +# CHECK: # | Error: 'env' cannot call 'cd' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-calls-colon.txt ({{[^)]*}}) -# CHECK: $ "env" "-u" "FOO" "BAR=3" ":" -# CHECK: Error: 'env' cannot call ':' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env -u FOO BAR=3 : +# CHECK: # executed command: env -u FOO BAR=3 : +# CHECK: # | Error: 'env' cannot call ':' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-calls-echo.txt ({{[^)]*}}) -# CHECK: $ "env" "-u" "FOO" "BAR=3" "echo" "hello" "world" -# CHECK: Error: 'env' cannot call 'echo' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env -u FOO BAR=3 echo hello world +# CHECK: # executed command: env -u FOO BAR=3 echo hello world +# CHECK: # | Error: 'env' cannot call 'echo' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: PASS: shtest-env :: env-calls-env.txt ({{[^)]*}}) -# CHECK: $ "env" "env" "{{[^"]*}}" "print_environment.py" -# CHECK: $ "env" "FOO=2" "env" "BAR=1" "{{[^"]*}}" "print_environment.py" -# CHECK: $ "env" "-u" "FOO" "env" "-u" "BAR" "{{[^"]*}}" "print_environment.py" -# CHECK: $ "env" "-u" "FOO" "BAR=1" "env" "-u" "BAR" "FOO=2" "{{[^"]*}}" "print_environment.py" -# CHECK: $ "env" "-u" "FOO" "BAR=1" "env" "-u" "BAR" "FOO=2" "env" "BAZ=3" "{{[^"]*}}" "print_environment.py" -# CHECK-NOT: ${{.*}}print_environment.py +# CHECK: env env [[PYTHON:.+]] print_environment.py | {{.*}} +# CHECK: # executed command: env env [[PYTHON_BARE:.+]] print_environment.py +# CHECK: env FOO=2 env BAR=1 [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env FOO=2 env BAR=1 [[PYTHON_BARE]] print_environment.py +# CHECK: env -u FOO env -u BAR [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env -u FOO env -u BAR [[PYTHON_BARE]] print_environment.py +# CHECK: env -u FOO BAR=1 env -u BAR FOO=2 [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env -u FOO BAR=1 env -u BAR FOO=2 [[PYTHON_BARE]] print_environment.py +# CHECK: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env -u FOO BAR=1 env -u BAR FOO=2 env BAZ=3 [[PYTHON_BARE]] print_environment.py +# CHECK-NOT: {{^[^#]}} +# CHECK: -- # CHECK: FAIL: shtest-env :: env-calls-export.txt ({{[^)]*}}) -# CHECK: $ "env" "-u" "FOO" "BAR=3" "export" "BAZ=3" -# CHECK: Error: 'env' cannot call 'export' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env -u FOO BAR=3 export BAZ=3 +# CHECK: # executed command: env 
-u FOO BAR=3 export BAZ=3 +# CHECK: # | Error: 'env' cannot call 'export' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-calls-mkdir.txt ({{[^)]*}}) -# CHECK: $ "env" "-u" "FOO" "BAR=3" "mkdir" "foobar" -# CHECK: Error: 'env' cannot call 'mkdir' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env -u FOO BAR=3 mkdir foobar +# CHECK: # executed command: env -u FOO BAR=3 mkdir foobar +# CHECK: # | Error: 'env' cannot call 'mkdir' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-calls-not-builtin.txt ({{[^)]*}}) -# CHECK: $ "env" "-u" "FOO" "BAR=3" "not" "rm" "{{.*}}.no-such-file" -# CHECK: Error: 'env' cannot call 'rm' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env -u FOO BAR=3 not rm {{.+}}.no-such-file +# CHECK: # executed command: env -u FOO BAR=3 not rm {{.+}}.no-such-file{{.*}} +# CHECK: # | Error: 'env' cannot call 'rm' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-env :: env-calls-rm.txt ({{[^)]*}}) -# CHECK: $ "env" "-u" "FOO" "BAR=3" "rm" "foobar" -# CHECK: Error: 'env' cannot call 'rm' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: env -u FOO BAR=3 rm foobar +# CHECK: # executed command: env -u FOO BAR=3 rm foobar +# CHECK: # | Error: 'env' cannot call 'rm' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: PASS: shtest-env :: env-u.txt ({{[^)]*}}) -# CHECK: $ "{{[^"]*}}" "print_environment.py" -# CHECK: $ "env" "-u" "FOO" "{{[^"]*}}" "print_environment.py" -# CHECK: $ "env" "-u" "FOO" "-u" "BAR" "{{[^"]*}}" "print_environment.py" -# CHECK-NOT: ${{.*}}print_environment.py +# CHECK: [[PYTHON]] print_environment.py | {{.*}} +# CHECK: env -u FOO [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env -u FOO [[PYTHON_BARE]] print_environment.py +# CHECK: env -u FOO -u BAR [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env -u FOO -u BAR [[PYTHON_BARE]] print_environment.py +# CHECK-NOT: {{^[^#]}} +# CHECK: -- # CHECK: PASS: shtest-env :: env.txt ({{[^)]*}}) -# CHECK: $ "env" "A_FOO=999" "{{[^"]*}}" "print_environment.py" -# CHECK: $ "env" "A_FOO=1" "B_BAR=2" "C_OOF=3" "{{[^"]*}}" "print_environment.py" -# CHECK-NOT: ${{.*}}print_environment.py +# CHECK: env A_FOO=999 [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env A_FOO=999 [[PYTHON_BARE]] print_environment.py +# CHECK: env A_FOO=1 B_BAR=2 C_OOF=3 [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env A_FOO=1 B_BAR=2 C_OOF=3 [[PYTHON_BARE]] print_environment.py +# CHECK-NOT: {{^[^#]}} +# CHECK: -- # CHECK: PASS: shtest-env :: mixed.txt ({{[^)]*}}) -# CHECK: $ "env" "A_FOO=999" "-u" "FOO" "{{[^"]*}}" "print_environment.py" -# CHECK: $ "env" "A_FOO=1" "-u" "FOO" "B_BAR=2" "-u" "BAR" "C_OOF=3" "{{[^"]*}}" "print_environment.py" -# CHECK-NOT: ${{.*}}print_environment.py +# CHECK: env A_FOO=999 -u FOO [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env A_FOO=999 -u FOO [[PYTHON_BARE]] print_environment.py +# CHECK: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 [[PYTHON]] print_environment.py | {{.*}} +# CHECK: # executed command: env A_FOO=1 -u FOO B_BAR=2 -u BAR C_OOF=3 [[PYTHON_BARE]] print_environment.py +# CHECK-NOT: {{^[^#]}} +# CHECK: -- # CHECK: Passed: 4 # CHECK: Failed: 12 diff --git a/llvm/utils/lit/tests/shtest-external-shell-kill.py b/llvm/utils/lit/tests/shtest-external-shell-kill.py new file mode 100644 index 
0000000000000..73f8d8601af62 --- /dev/null +++ b/llvm/utils/lit/tests/shtest-external-shell-kill.py @@ -0,0 +1,36 @@ +# This test exercises an external shell use case that, at least at one time, +# appeared in the following tests: +# +# compiler-rt/test/fuzzer/fork-sigusr.test +# compiler-rt/test/fuzzer/merge-sigusr.test +# compiler-rt/test/fuzzer/sigint.test +# compiler-rt/test/fuzzer/sigusr.test +# +# That is, a RUN line can be: +# +# cmd & PID=$! +# +# It is important that '&' only puts 'cmd' in the background and not the +# debugging commands that lit inserts before 'cmd'. Otherwise: +# +# - The debugging commands might execute later than they are supposed to. +# - A later 'kill $PID' can kill more than just 'cmd'. We've seen it even +# manage to terminate the shell running lit. +# +# The last FileCheck directive below checks that the debugging commands for the +# above RUN line are not killed and do execute at the right time. + +# RUN: %{lit} -a %{inputs}/shtest-external-shell-kill | FileCheck %s +# END. + +# CHECK: Command Output (stdout): +# CHECK-NEXT: -- +# CHECK-NEXT: start +# CHECK-NEXT: end +# CHECK-EMPTY: +# CHECK-NEXT: -- +# CHECK-NEXT: Command Output (stderr): +# CHECK-NEXT: -- +# CHECK-NEXT: RUN: at line 1: echo start +# CHECK-NEXT: echo start +# CHECK-NEXT: RUN: at line 2: sleep [[#]] & PID=$! diff --git a/llvm/utils/lit/tests/shtest-format.py b/llvm/utils/lit/tests/shtest-format.py index b41a3f763f384..4a3d65b7bce4f 100644 --- a/llvm/utils/lit/tests/shtest-format.py +++ b/llvm/utils/lit/tests/shtest-format.py @@ -37,22 +37,27 @@ # CHECK: PASS: shtest-format :: external_shell/pass.txt -# CHECK: FAIL: shtest-format :: fail.txt -# CHECK-NEXT: *** TEST 'shtest-format :: fail.txt' FAILED *** -# CHECK-NEXT: Script: -# CHECK-NEXT: -- -# CHECK-NEXT: printf "line 1 -# CHECK-NEXT: false -# CHECK-NEXT: -- -# CHECK-NEXT: Exit Code: 1 -# -# CHECK: Command Output (stdout): -# CHECK-NEXT: -- -# CHECK-NEXT: $ ":" "RUN: at line 1" -# CHECK-NEXT: $ "printf" -# CHECK-NEXT: # command output: -# CHECK-NEXT: line 1: failed test output on stdout -# CHECK-NEXT: line 2: failed test output on stdout +# CHECK: FAIL: shtest-format :: fail.txt +# CHECK-NEXT: *** TEST 'shtest-format :: fail.txt' FAILED *** +# CHECK-NEXT: Exit Code: 1 +# CHECK-EMPTY: +# CHECK-NEXT: Command Output (stdout): +# CHECK-NEXT: -- +# CHECK-NEXT: # RUN: at line 1 +# CHECK-NEXT: printf "line 1: failed test output on stdout\nline 2: failed test output on stdout" +# CHECK-NEXT: executed command: printf 'line 1: failed test output on stdout\nline 2: failed test output on stdout' +# CHECK-NEXT: # .---command stdout------------ +# CHECK-NEXT: # | line 1: failed test output on stdout +# CHECK-NEXT: # | line 2: failed test output on stdout +# CHECK-NEXT: # `----------------------------- +# CHECK-NEXT: # RUN: at line 2 +# CHECK-NEXT: false +# CHECK-NEXT: # executed command: false +# CHECK-NEXT: # note: command had no output on stdout or stderr +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-EMPTY: +# CHECK-NEXT: -- + # CHECK: UNRESOLVED: shtest-format :: no-test-line.txt # CHECK: PASS: shtest-format :: pass.txt @@ -69,12 +74,18 @@ # CHECK: XFAIL: shtest-format :: xfail-feature.txt # CHECK: XFAIL: shtest-format :: xfail-target.txt # CHECK: XFAIL: shtest-format :: xfail.txt -# CHECK: XPASS: shtest-format :: xpass.txt -# CHECK-NEXT: *** TEST 'shtest-format :: xpass.txt' FAILED *** -# CHECK-NEXT: Script -# CHECK-NEXT: -- -# CHECK-NEXT: true -# CHECK-NEXT: -- + +# CHECK: XPASS: shtest-format :: xpass.txt +# CHECK-NEXT: 
*** TEST 'shtest-format :: xpass.txt' FAILED *** +# CHECK-NEXT: Exit Code: 0 +# CHECK-EMPTY: +# CHECK-NEXT: Command Output (stdout): +# CHECK-NEXT: -- +# CHECK-NEXT: # RUN: at line 1 +# CHECK-NEXT: true +# CHECK-NEXT: # executed command: true +# CHECK-EMPTY: +# CHECK-NEXT: -- # CHECK: Failed Tests (4) # CHECK: shtest-format :: external_shell/fail.txt @@ -109,13 +120,16 @@ # XUNIT: # XUNIT-NEXT: -# XUNIT: -# XUNIT-NEXT: -# XUNIT-NEXT: +# XUNIT: +# XUNIT-NEXT: +# XUNIT-NEXT: # XUNIT: diff --git a/llvm/utils/lit/tests/shtest-if-else.py b/llvm/utils/lit/tests/shtest-if-else.py index aaf94a6e24372..c18da4abbcca5 100644 --- a/llvm/utils/lit/tests/shtest-if-else.py +++ b/llvm/utils/lit/tests/shtest-if-else.py @@ -1,5 +1,6 @@ # RUN: %{lit} -v --show-all %{inputs}/shtest-if-else/test.txt \ -# RUN: | FileCheck %{inputs}/shtest-if-else/test.txt --match-full-lines +# RUN: | FileCheck %{inputs}/shtest-if-else/test.txt --match-full-lines \ +# RUN: --implicit-check-not='RUN:' # RUN: not %{lit} -v --show-all %{inputs}/shtest-if-else/test-neg1.txt 2>&1 \ # RUN: | FileCheck %{inputs}/shtest-if-else/test-neg1.txt diff --git a/llvm/utils/lit/tests/shtest-inject.py b/llvm/utils/lit/tests/shtest-inject.py index c6fa799ac3bda..3d34eb7161d46 100644 --- a/llvm/utils/lit/tests/shtest-inject.py +++ b/llvm/utils/lit/tests/shtest-inject.py @@ -1,27 +1,31 @@ -# Check that we can inject commands at the beginning of a ShTest. +# Check that we can inject preamble commands at the beginning of a ShTest. +# +# For one case, check the execution trace as these preamble commands have +# "preamble command" instead of the usual "{{RUN}}: at line N". # RUN: %{lit} %{inputs}/shtest-inject/test-empty.txt --show-all | FileCheck --check-prefix=CHECK-TEST1 %s # -# CHECK-TEST1: Script: -# CHECK-TEST1: -- -# CHECK-TEST1: echo "THIS WAS" -# CHECK-TEST1: echo "INJECTED" -# CHECK-TEST1: -- -# -# CHECK-TEST1: THIS WAS -# CHECK-TEST1: INJECTED +# CHECK-TEST1: Command Output (stdout): +# CHECK-TEST1-NEXT: -- +# CHECK-TEST1-NEXT: # preamble command line +# CHECK-TEST1-NEXT: echo "THIS WAS" +# CHECK-TEST1-NEXT: # executed command: echo 'THIS WAS' +# CHECK-TEST1-NEXT: # .---command stdout{{-*}} +# CHECK-TEST1-NEXT: # | THIS WAS +# CHECK-TEST1-NEXT: # `---{{-*}} +# CHECK-TEST1-NEXT: # preamble command line +# CHECK-TEST1-NEXT: echo "INJECTED" +# CHECK-TEST1-NEXT: # executed command: echo INJECTED +# CHECK-TEST1-NEXT: # .---command stdout{{-*}} +# CHECK-TEST1-NEXT: # | INJECTED +# CHECK-TEST1-NEXT: # `---{{-*}} +# CHECK-TEST1-EMPTY: +# CHECK-TEST1-NEXT: -- # # CHECK-TEST1: Passed: 1 # RUN: %{lit} %{inputs}/shtest-inject/test-one.txt --show-all | FileCheck --check-prefix=CHECK-TEST2 %s # -# CHECK-TEST2: Script: -# CHECK-TEST2: -- -# CHECK-TEST2: echo "THIS WAS" -# CHECK-TEST2: echo "INJECTED" -# CHECK-TEST2: echo "IN THE FILE" -# CHECK-TEST2: -- -# # CHECK-TEST2: THIS WAS # CHECK-TEST2: INJECTED # CHECK-TEST2: IN THE FILE @@ -30,15 +34,6 @@ # RUN: %{lit} %{inputs}/shtest-inject/test-many.txt --show-all | FileCheck --check-prefix=CHECK-TEST3 %s # -# CHECK-TEST3: Script: -# CHECK-TEST3: -- -# CHECK-TEST3: echo "THIS WAS" -# CHECK-TEST3: echo "INJECTED" -# CHECK-TEST3: echo "IN THE FILE" -# CHECK-TEST3: echo "IF IT WORKS" -# CHECK-TEST3: echo "AS EXPECTED" -# CHECK-TEST3: -- -# # CHECK-TEST3: THIS WAS # CHECK-TEST3: INJECTED # CHECK-TEST3: IN THE FILE diff --git a/llvm/utils/lit/tests/shtest-not.py b/llvm/utils/lit/tests/shtest-not.py index 53bd6356ad930..f514cddf1deff 100644 --- a/llvm/utils/lit/tests/shtest-not.py +++ 
b/llvm/utils/lit/tests/shtest-not.py @@ -10,132 +10,179 @@ # CHECK: -- Testing: 17 tests{{.*}} # CHECK: FAIL: shtest-not :: exclamation-args-nested-none.txt {{.*}} -# CHECK: $ "!" "!" "!" -# CHECK: Error: '!' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: ! ! ! +# CHECK: # executed command: ! ! ! +# CHECK: # | Error: '!' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: exclamation-args-none.txt {{.*}} -# CHECK: $ "!" -# CHECK: Error: '!' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: ! +# CHECK: # executed command: ! +# CHECK: # | Error: '!' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: exclamation-calls-external.txt {{.*}} -# CHECK: $ "!" "{{[^"]*}}" "fail.py" -# CHECK: $ "!" "!" "{{[^"]*}}" "pass.py" -# CHECK: $ "!" "!" "!" "{{[^"]*}}" "fail.py" -# CHECK: $ "!" "!" "!" "!" "{{[^"]*}}" "pass.py" +# CHECK: ! [[PYTHON:.*]] fail.py +# CHECK: # executed command: ! [[PYTHON_BARE:.*]] fail.py +# CHECK: ! ! [[PYTHON]] pass.py +# CHECK: # executed command: ! ! [[PYTHON_BARE]] pass.py +# CHECK: ! ! ! [[PYTHON]] fail.py +# CHECK: # executed command: ! ! ! [[PYTHON_BARE]] fail.py +# CHECK: ! ! ! ! [[PYTHON]] pass.py +# CHECK: # executed command: ! ! ! ! [[PYTHON_BARE]] pass.py -# CHECK: $ "!" "{{[^"]*}}" "pass.py" -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: ! [[PYTHON]] pass.py +# CHECK: # executed command: ! [[PYTHON_BARE]] pass.py +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: not-args-last-is-crash.txt {{.*}} -# CHECK: $ "not" "--crash" -# CHECK: Error: 'not' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not --crash +# CHECK: # executed command: not --crash +# CHECK: # | Error: 'not' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: not-args-nested-none.txt {{.*}} -# CHECK: $ "not" "not" "not" -# CHECK: Error: 'not' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not not not +# CHECK: # executed command: not not not +# CHECK: # | Error: 'not' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: not-args-none.txt {{.*}} -# CHECK: $ "not" -# CHECK: Error: 'not' requires a subcommand -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not +# CHECK: # executed command: not +# CHECK: # | Error: 'not' requires a subcommand +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: not-calls-cd.txt {{.*}} -# CHECK: $ "not" "not" "cd" "foobar" -# CHECK: $ "not" "--crash" "cd" "foobar" -# CHECK: Error: 'not --crash' cannot call 'cd' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not not cd foobar +# CHECK: # executed command: not not cd foobar +# CHECK: not --crash cd foobar +# CHECK: # executed command: not --crash cd foobar +# CHECK: # | Error: 'not --crash' cannot call 'cd' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: not-calls-colon.txt {{.*}} -# CHECK: $ "not" "not" ":" "foobar" -# CHECK: $ "not" "--crash" ":" -# CHECK: Error: 'not --crash' cannot call ':' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not not : foobar +# CHECK: # executed command: not not : foobar +# CHECK: not --crash : +# CHECK: # executed 
command: not --crash : +# CHECK: # | Error: 'not --crash' cannot call ':' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: not-calls-diff-with-crash.txt {{.*}} -# CHECK: $ "not" "--crash" "diff" "-u" {{.*}} -# CHECK-NOT: "$" +# CHECK: not --crash diff -u {{.*}} +# CHECK: # executed command: not --crash diff -u {{.*}} +# CHECK-NOT: # executed command: {{.*}} # CHECK-NOT: {{[Ee]rror}} -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: # error: command failed with exit status: {{.*}} +# CHECK-NOT: # executed command: {{.*}} # CHECK-NOT: {{[Ee]rror}} -# CHECK-NOT: "$" # CHECK: FAIL: shtest-not :: not-calls-diff.txt {{.*}} -# CHECK: $ "not" "diff" {{.*}} -# CHECK: $ "not" "not" "not" "diff" {{.*}} -# CHECK: $ "not" "not" "not" "not" "not" "diff" {{.*}} -# CHECK: $ "diff" {{.*}} -# CHECK: $ "not" "not" "diff" {{.*}} -# CHECK: $ "not" "not" "not" "not" "diff" {{.*}} -# CHECK: $ "not" "diff" {{.*}} -# CHECK-NOT: "$" +# CHECK: not diff {{.*}} +# CHECK: # executed command: not diff {{.*}} +# CHECK: not not not diff {{.*}} +# CHECK: # executed command: not not not diff {{.*}} +# CHECK: not not not not not diff {{.*}} +# CHECK: # executed command: not not not not not diff {{.*}} +# CHECK: diff {{.*}} +# CHECK: # executed command: diff {{.*}} +# CHECK: not not diff {{.*}} +# CHECK: # executed command: not not diff {{.*}} +# CHECK: not not not not diff {{.*}} +# CHECK: # executed command: not not not not diff {{.*}} +# CHECK: not diff {{.*}} +# CHECK: # executed command: not diff {{.*}} +# CHECK-NOT: # executed command: {{.*}} # CHECK: FAIL: shtest-not :: not-calls-echo.txt {{.*}} -# CHECK: $ "not" "not" "echo" "hello" "world" -# CHECK: $ "not" "--crash" "echo" "hello" "world" -# CHECK: Error: 'not --crash' cannot call 'echo' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not not echo hello world +# CHECK: # executed command: not not echo hello world +# CHECK: not --crash echo hello world +# CHECK: # executed command: not --crash echo hello world +# CHECK: # | Error: 'not --crash' cannot call 'echo' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: not-calls-env-builtin.txt {{.*}} -# CHECK: $ "not" "--crash" "env" "-u" "FOO" "BAR=3" "rm" "{{.*}}.no-such-file" -# CHECK: Error: 'env' cannot call 'rm' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not --crash env -u FOO BAR=3 rm {{.*}}.no-such-file +# CHECK: # executed command: not --crash env -u FOO BAR=3 rm {{.+}}.no-such-file{{.*}} +# CHECK: # | Error: 'env' cannot call 'rm' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: not-calls-export.txt {{.*}} -# CHECK: $ "not" "not" "export" "FOO=1" -# CHECK: $ "not" "--crash" "export" "BAZ=3" -# CHECK: Error: 'not --crash' cannot call 'export' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not not export FOO=1 +# CHECK: # executed command: not not export FOO=1 +# CHECK: not --crash export BAZ=3 +# CHECK: # executed command: not --crash export BAZ=3 +# CHECK: # | Error: 'not --crash' cannot call 'export' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: PASS: shtest-not :: not-calls-external.txt {{.*}} -# CHECK: $ "not" "{{[^"]*}}" "fail.py" -# CHECK: $ "not" "not" "{{[^"]*}}" "pass.py" -# CHECK: $ "not" "not" "not" "{{[^"]*}}" "fail.py" -# CHECK: $ "not" "not" "not" "not" "{{[^"]*}}" "pass.py" - -# CHECK: $ "not" "not" "--crash" "{{[^"]*}}" "pass.py" -# CHECK: $ "not" "not" "--crash" "{{[^"]*}}" "fail.py" 
-# CHECK: $ "not" "not" "--crash" "not" "{{[^"]*}}" "pass.py" -# CHECK: $ "not" "not" "--crash" "not" "{{[^"]*}}" "fail.py" - -# CHECK: $ "env" "not" "{{[^"]*}}" "fail.py" -# CHECK: $ "not" "env" "{{[^"]*}}" "fail.py" -# CHECK: $ "env" "FOO=1" "not" "{{[^"]*}}" "fail.py" -# CHECK: $ "not" "env" "FOO=1" "BAR=1" "{{[^"]*}}" "fail.py" -# CHECK: $ "env" "FOO=1" "BAR=1" "not" "env" "-u" "FOO" "BAR=2" "{{[^"]*}}" "fail.py" -# CHECK: $ "not" "env" "FOO=1" "BAR=1" "not" "env" "-u" "FOO" "-u" "BAR" "{{[^"]*}}" "pass.py" -# CHECK: $ "not" "not" "env" "FOO=1" "env" "FOO=2" "BAR=1" "{{[^"]*}}" "pass.py" -# CHECK: $ "env" "FOO=1" "-u" "BAR" "env" "-u" "FOO" "BAR=1" "not" "not" "{{[^"]*}}" "pass.py" - -# CHECK: $ "not" "env" "FOO=1" "BAR=1" "env" "FOO=2" "BAR=2" "not" "--crash" "{{[^"]*}}" "pass.py" -# CHECK: $ "not" "env" "FOO=1" "BAR=1" "not" "--crash" "not" "{{[^"]*}}" "pass.py" -# CHECK: $ "not" "not" "--crash" "env" "-u" "BAR" "not" "env" "-u" "FOO" "BAR=1" "{{[^"]*}}" "pass.py" +# CHECK: not [[PYTHON]] fail.py +# CHECK: # executed command: not [[PYTHON_BARE]] fail.py +# CHECK: not not [[PYTHON]] pass.py +# CHECK: # executed command: not not [[PYTHON_BARE]] pass.py +# CHECK: not not not [[PYTHON]] fail.py +# CHECK: # executed command: not not not [[PYTHON_BARE]] fail.py +# CHECK: not not not not [[PYTHON]] pass.py +# CHECK: # executed command: not not not not [[PYTHON_BARE]] pass.py + +# CHECK: not not --crash [[PYTHON]] pass.py +# CHECK: # executed command: not not --crash [[PYTHON_BARE]] pass.py +# CHECK: not not --crash [[PYTHON]] fail.py +# CHECK: # executed command: not not --crash [[PYTHON_BARE]] fail.py +# CHECK: not not --crash not [[PYTHON]] pass.py +# CHECK: # executed command: not not --crash not [[PYTHON_BARE]] pass.py +# CHECK: not not --crash not [[PYTHON]] fail.py +# CHECK: # executed command: not not --crash not [[PYTHON_BARE]] fail.py + +# CHECK: env not [[PYTHON]] fail.py | {{.*}} +# CHECK: # executed command: env not [[PYTHON_BARE]] fail.py +# CHECK: not env [[PYTHON]] fail.py | {{.*}} +# CHECK: # executed command: not env [[PYTHON_BARE]] fail.py +# CHECK: env FOO=1 not [[PYTHON]] fail.py | {{.*}} +# CHECK: # executed command: env FOO=1 not [[PYTHON_BARE]] fail.py +# CHECK: not env FOO=1 BAR=1 [[PYTHON]] fail.py | {{.*}} +# CHECK: # executed command: not env FOO=1 BAR=1 [[PYTHON_BARE]] fail.py +# CHECK: env FOO=1 BAR=1 not env -u FOO BAR=2 [[PYTHON]] fail.py | {{.*}} +# CHECK: # executed command: env FOO=1 BAR=1 not env -u FOO BAR=2 [[PYTHON_BARE]] fail.py +# CHECK: not env FOO=1 BAR=1 not env -u FOO -u BAR [[PYTHON]] pass.py | {{.*}} +# CHECK: # executed command: not env FOO=1 BAR=1 not env -u FOO -u BAR [[PYTHON_BARE]] pass.py +# CHECK: not not env FOO=1 env FOO=2 BAR=1 [[PYTHON]] pass.py | {{.*}} +# CHECK: # executed command: not not env FOO=1 env FOO=2 BAR=1 [[PYTHON_BARE]] pass.py +# CHECK: env FOO=1 -u BAR env -u FOO BAR=1 not not [[PYTHON]] pass.py | {{.*}} +# CHECK: # executed command: env FOO=1 -u BAR env -u FOO BAR=1 not not [[PYTHON_BARE]] pass.py + +# CHECK: not env FOO=1 BAR=1 env FOO=2 BAR=2 not --crash [[PYTHON]] pass.py | {{.*}} +# CHECK: # executed command: not env FOO=1 BAR=1 env FOO=2 BAR=2 not --crash [[PYTHON_BARE]] pass.py +# CHECK: not env FOO=1 BAR=1 not --crash not [[PYTHON]] pass.py | {{.*}} +# CHECK: # executed command: not env FOO=1 BAR=1 not --crash not [[PYTHON_BARE]] pass.py +# CHECK: not not --crash env -u BAR not env -u FOO BAR=1 [[PYTHON]] pass.py | {{.*}} +# CHECK: # executed command: not not --crash env -u BAR not env -u FOO BAR=1 
[[PYTHON_BARE]] pass.py # CHECK: FAIL: shtest-not :: not-calls-fail2.txt {{.*}} # CHECK-NEXT: {{.*}} TEST 'shtest-not :: not-calls-fail2.txt' FAILED {{.*}} -# CHECK-NEXT: Script: -# CHECK-NEXT: -- -# CHECK: -- # CHECK-NEXT: Exit Code: 1 # CHECK: FAIL: shtest-not :: not-calls-mkdir.txt {{.*}} -# CHECK: $ "not" "mkdir" {{.*}} -# CHECK: $ "not" "--crash" "mkdir" "foobar" -# CHECK: Error: 'not --crash' cannot call 'mkdir' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not mkdir {{.*}} +# CHECK: # executed command: not mkdir {{.*}} +# CHECK: not --crash mkdir foobar +# CHECK: # executed command: not --crash mkdir foobar +# CHECK: # | Error: 'not --crash' cannot call 'mkdir' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: FAIL: shtest-not :: not-calls-rm.txt {{.*}} -# CHECK: $ "not" "rm" {{.*}} -# CHECK: $ "not" "--crash" "rm" "foobar" -# CHECK: Error: 'not --crash' cannot call 'rm' -# CHECK: error: command failed with exit status: {{.*}} +# CHECK: not rm {{.*}} +# CHECK: # executed command: not rm {{.*}} +# CHECK: not --crash rm foobar +# CHECK: # executed command: not --crash rm foobar +# CHECK: # | Error: 'not --crash' cannot call 'rm' +# CHECK: # error: command failed with exit status: {{.*}} # CHECK: Passed: 1 # CHECK: Failed: 16 diff --git a/llvm/utils/lit/tests/shtest-output-printing.py b/llvm/utils/lit/tests/shtest-output-printing.py index d5ec413fa04be..129cff981eb5b 100644 --- a/llvm/utils/lit/tests/shtest-output-printing.py +++ b/llvm/utils/lit/tests/shtest-output-printing.py @@ -1,31 +1,45 @@ # Check the various features of the ShTest format. # # RUN: not %{lit} -v %{inputs}/shtest-output-printing > %t.out -# RUN: FileCheck --input-file %t.out %s +# RUN: FileCheck --input-file %t.out --match-full-lines %s # # END. 
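+#
+# For orientation: the trace format asserted below renders each RUN command
+# roughly as in the following sketch (illustrative only, mirroring the CHECK
+# expectations further down; 'echo hi' stands in for any command):
+#
+#   # RUN: at line 2
+#   echo hi
+#   # executed command: echo hi
+#   # .---command stdout------------
+#   # | hi
+#   # `-----------------------------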
-# CHECK: -- Testing: - -# CHECK: FAIL: shtest-output-printing :: basic.txt -# CHECK-NEXT: *** TEST 'shtest-output-printing :: basic.txt' FAILED *** -# CHECK-NEXT: Script: -# CHECK-NEXT: -- -# CHECK: -- -# CHECK-NEXT: Exit Code: 1 -# -# CHECK: Command Output -# CHECK-NEXT: -- -# CHECK-NEXT: $ ":" "RUN: at line 1" -# CHECK-NEXT: $ "true" -# CHECK-NEXT: $ ":" "RUN: at line 2" -# CHECK-NEXT: $ "echo" "hi" -# CHECK-NEXT: # command output: -# CHECK-NEXT: hi -# -# CHECK: $ ":" "RUN: at line 3" -# CHECK-NEXT: $ "not" "not" "wc" "missing-file" -# CHECK-NEXT: # redirected output from '{{.*(/|\\\\)}}basic.txt.tmp.out': -# CHECK-NEXT: {{cannot open missing-file|missing-file.* No such file or directory}} -# CHECK: note: command had no output on stdout or stderr -# CHECK-NEXT: error: command failed with exit status: 1 +# CHECK: -- Testing: {{.*}} +# CHECK: FAIL: shtest-output-printing :: basic.txt {{.*}} +# CHECK-NEXT: ***{{\**}} TEST 'shtest-output-printing :: basic.txt' FAILED ***{{\**}} +# CHECK-NEXT: Exit Code: 1 +# CHECK-EMPTY: +# CHECK-NEXT: Command Output (stdout): +# CHECK-NEXT: -- +# CHECK-NEXT: # RUN: at line 1 +# CHECK-NEXT: true +# CHECK-NEXT: # executed command: true +# CHECK-NEXT: # RUN: at line 2 +# CHECK-NEXT: echo hi +# CHECK-NEXT: # executed command: echo hi +# CHECK-NEXT: # .---command stdout------------ +# CHECK-NEXT: # | hi +# CHECK-NEXT: # `----------------------------- +# CHECK-NEXT: # RUN: at line 3 +# CHECK-NEXT: not not wc missing-file &> [[FILE:.*]] || true +# CHECK-NEXT: # executed command: not not wc missing-file +# CHECK-NEXT: # .---redirected output from '[[FILE]]' +# CHECK-NEXT: # | wc: {{cannot open missing-file|missing-file.* No such file or directory}} +# CHECK-NEXT: # `----------------------------- +# CHECK-NEXT: # note: command had no output on stdout or stderr +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true +# CHECK-NEXT: # RUN: at line 4 +# CHECK-NEXT: not {{.*}}python{{.*}} {{.*}}write-a-lot.py &> [[FILE:.*]] +# CHECK-NEXT: # executed command: not {{.*}}python{{.*}} {{.*}}write-a-lot.py{{.*}} +# CHECK-NEXT: # .---redirected output from '[[FILE]]' +# CHECK-NEXT: # | All work and no play makes Jack a dull boy. +# CHECK-NEXT: # | All work and no play makes Jack a dull boy. +# CHECK-NEXT: # | All work and no play makes Jack a dull boy. +# CHECK: # | ... 
+# CHECK-NEXT: # `---data was truncated-------- +# CHECK-NEXT: # note: command had no output on stdout or stderr +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-EMPTY: +# CHECK-NEXT:-- diff --git a/llvm/utils/lit/tests/shtest-pushd-popd.py b/llvm/utils/lit/tests/shtest-pushd-popd.py index 26296a7ffcf59..6d7e93c74a050 100644 --- a/llvm/utils/lit/tests/shtest-pushd-popd.py +++ b/llvm/utils/lit/tests/shtest-pushd-popd.py @@ -8,16 +8,16 @@ # CHECK: -- Testing: 4 tests{{.*}} # CHECK: FAIL: shtest-pushd-popd :: popd-args.txt ({{[^)]*}}) -# CHECK: $ "popd" "invalid" -# CHECK: 'popd' does not support arguments +# CHECK: popd invalid +# CHECK: # | 'popd' does not support arguments # CHECK: FAIL: shtest-pushd-popd :: popd-no-stack.txt ({{[^)]*}}) -# CHECK: $ "popd" -# CHECK: popd: directory stack empty +# CHECK: popd +# CHECK: # | popd: directory stack empty # CHECK: FAIL: shtest-pushd-popd :: pushd-too-many-args.txt ({{[^)]*}}) -# CHECK: $ "pushd" "a" "b" -# CHECK: 'pushd' supports only one argument +# CHECK: pushd a b +# CHECK: # | 'pushd' supports only one argument # CHECK: Passed: 1 # CHECK: Failed: 3 diff --git a/llvm/utils/lit/tests/shtest-recursive-substitution.py b/llvm/utils/lit/tests/shtest-recursive-substitution.py index 48f4b5b124911..65c177e65a3c7 100644 --- a/llvm/utils/lit/tests/shtest-recursive-substitution.py +++ b/llvm/utils/lit/tests/shtest-recursive-substitution.py @@ -3,7 +3,7 @@ # RUN: %{lit} %{inputs}/shtest-recursive-substitution/substitutes-within-limit --show-all | FileCheck --check-prefix=CHECK-TEST1 %s # CHECK-TEST1: PASS: substitutes-within-limit :: test.py -# CHECK-TEST1: $ "echo" "STOP" +# CHECK-TEST1: echo STOP # RUN: not %{lit} %{inputs}/shtest-recursive-substitution/does-not-substitute-within-limit --show-all | FileCheck --check-prefix=CHECK-TEST2 %s # CHECK-TEST2: UNRESOLVED: does-not-substitute-within-limit :: test.py @@ -11,7 +11,7 @@ # RUN: %{lit} %{inputs}/shtest-recursive-substitution/does-not-substitute-no-limit --show-all | FileCheck --check-prefix=CHECK-TEST3 %s # CHECK-TEST3: PASS: does-not-substitute-no-limit :: test.py -# CHECK-TEST3: $ "echo" "%rec4" +# CHECK-TEST3: echo %rec4 # RUN: not %{lit} %{inputs}/shtest-recursive-substitution/not-an-integer --show-all 2>&1 | FileCheck --check-prefix=CHECK-TEST4 %s # CHECK-TEST4: recursiveExpansionLimit must be either None or an integer @@ -24,4 +24,4 @@ # RUN: %{lit} %{inputs}/shtest-recursive-substitution/escaping --show-all | FileCheck --check-prefix=CHECK-TEST7 %s # CHECK-TEST7: PASS: escaping :: test.py -# CHECK-TEST7: $ "echo" "%s" "%s" "%%s" +# CHECK-TEST7: echo %s %s %%s diff --git a/llvm/utils/lit/tests/shtest-run-at-line.py b/llvm/utils/lit/tests/shtest-run-at-line.py index ccd85b505b7e4..b92aee97628b2 100644 --- a/llvm/utils/lit/tests/shtest-run-at-line.py +++ b/llvm/utils/lit/tests/shtest-run-at-line.py @@ -1,13 +1,12 @@ -# Check that -vv makes the line number of the failing RUN command clear. -# (-v is actually sufficient in the case of the internal shell.) +# Check that -a/-v/-vv makes the line number of the failing RUN command clear. -# RUN: not %{lit} -vv %{inputs}/shtest-run-at-line > %t.out -# RUN: FileCheck --input-file %t.out %s -# +# RUN: not %{lit} -a %{inputs}/shtest-run-at-line | FileCheck %s +# RUN: not %{lit} -v %{inputs}/shtest-run-at-line | FileCheck %s +# RUN: not %{lit} -vv %{inputs}/shtest-run-at-line | FileCheck %s # END. 
-# CHECK: Testing: 4 tests +# CHECK: Testing: 6 tests # In the case of the external shell, we check for only RUN lines in stderr in @@ -15,56 +14,75 @@ # CHECK-LABEL: FAIL: shtest-run-at-line :: external-shell/basic.txt -# CHECK: Script: -# CHECK: RUN: at line 4{{.*}} true -# CHECK-NEXT: RUN: at line 5{{.*}} false -# CHECK-NEXT: RUN: at line 6{{.*}} true - -# CHECK: RUN: at line 4 -# CHECK: RUN: at line 5 -# CHECK-NOT: RUN +# CHECK: Command Output (stderr) +# CHECK-NEXT: -- +# CHECK-NEXT: {{^}}RUN: at line 4: true{{$}} +# CHECK-NEXT: true +# CHECK-NEXT: {{^}}RUN: at line 5: false{{$}} +# CHECK-NEXT: false +# CHECK-EMPTY: +# CHECK-NEXT: -- + +# CHECK-LABEL: FAIL: shtest-run-at-line :: external-shell/empty-run-line.txt + +# CHECK: Command Output (stderr) +# CHECK-NEXT: -- +# CHECK-NEXT: {{^}}RUN: at line 2 has no command after substitutions{{$}} +# CHECK-NEXT: {{^}}RUN: at line 3: false{{$}} +# CHECK-NEXT: false +# CHECK-EMPTY: +# CHECK-NEXT: -- # CHECK-LABEL: FAIL: shtest-run-at-line :: external-shell/line-continuation.txt -# CHECK: Script: -# CHECK: RUN: at line 4{{.*}} echo 'foo bar' | FileCheck -# CHECK-NEXT: RUN: at line 6{{.*}} echo 'foo baz' | FileCheck -# CHECK-NEXT: RUN: at line 9{{.*}} echo 'foo bar' | FileCheck +# The execution trace from an external sh-like shell might print the commands +# from a pipeline in any order, so this time just check that lit suppresses the +# trace of the echo command for each 'RUN: at line N: cmd-line'. -# CHECK: RUN: at line 4 -# CHECK: RUN: at line 6 -# CHECK-NOT: RUN +# CHECK: Command Output (stderr) +# CHECK-NEXT: -- +# CHECK-NEXT: {{^}}RUN: at line 4: echo 'foo bar' | FileCheck +# CHECK-NOT: RUN +# CHECK: {{^}}RUN: at line 6: echo 'foo baz' | FileCheck +# CHECK-NOT: RUN +# CHECK: -- # CHECK-LABEL: FAIL: shtest-run-at-line :: internal-shell/basic.txt -# CHECK: Script: -# CHECK: : 'RUN: at line 1'; true -# CHECK-NEXT: : 'RUN: at line 2'; false -# CHECK-NEXT: : 'RUN: at line 3'; true - # CHECK: Command Output (stdout) -# CHECK: $ ":" "RUN: at line 1" -# CHECK-NEXT: $ "true" -# CHECK-NEXT: $ ":" "RUN: at line 2" -# CHECK-NEXT: $ "false" +# CHECK-NEXT: -- +# CHECK-NEXT: # RUN: at line 1 +# CHECK-NEXT: true +# CHECK-NEXT: # executed command: true +# CHECK-NEXT: # RUN: at line 2 +# CHECK-NEXT: false +# CHECK-NEXT: # executed command: false # CHECK-NOT: RUN -# CHECK-LABEL: FAIL: shtest-run-at-line :: internal-shell/line-continuation.txt +# CHECK-LABEL: FAIL: shtest-run-at-line :: internal-shell/empty-run-line.txt -# CHECK: Script: -# CHECK: : 'RUN: at line 1'; : first line continued to second line -# CHECK-NEXT: : 'RUN: at line 3'; echo 'foo bar' | FileCheck -# CHECK-NEXT: : 'RUN: at line 5'; echo 'foo baz' | FileCheck -# CHECK-NEXT: : 'RUN: at line 8'; echo 'foo bar' | FileCheck +# CHECK: Command Output (stdout) +# CHECK-NEXT: -- +# CHECK-NEXT: # RUN: at line 2 has no command after substitutions +# CHECK-NEXT: # RUN: at line 3 +# CHECK-NEXT: false +# CHECK-NEXT: # executed command: false +# CHECK-NOT: RUN + +# CHECK-LABEL: FAIL: shtest-run-at-line :: internal-shell/line-continuation.txt # CHECK: Command Output (stdout) -# CHECK: $ ":" "RUN: at line 1" -# CHECK-NEXT: $ ":" "first" "line" "continued" "to" "second" "line" -# CHECK-NEXT: $ ":" "RUN: at line 3" -# CHECK-NEXT: $ "echo" "foo bar" -# CHECK-NEXT: $ "FileCheck" "{{.*}}" -# CHECK-NEXT: $ ":" "RUN: at line 5" -# CHECK-NEXT: $ "echo" "foo baz" -# CHECK-NEXT: $ "FileCheck" "{{.*}}" +# CHECK-NEXT: -- +# CHECK-NEXT: # RUN: at line 1 +# CHECK-NEXT: : first line continued to second line +# CHECK-NEXT: # 
executed command: : first line continued to second line +# CHECK-NEXT: # RUN: at line 3 +# CHECK-NEXT: echo 'foo bar' | FileCheck {{.*}} +# CHECK-NEXT: # executed command: echo 'foo bar' +# CHECK-NEXT: # executed command: FileCheck {{.*}} +# CHECK-NEXT: # RUN: at line 5 +# CHECK-NEXT: echo 'foo baz' | FileCheck {{.*}} +# CHECK-NEXT: # executed command: echo 'foo baz' +# CHECK-NEXT: # executed command: FileCheck {{.*}} # CHECK-NOT: RUN diff --git a/llvm/utils/lit/tests/shtest-shell.py b/llvm/utils/lit/tests/shtest-shell.py index 93f05dbd35d0d..a043582d6ae2b 100644 --- a/llvm/utils/lit/tests/shtest-shell.py +++ b/llvm/utils/lit/tests/shtest-shell.py @@ -20,202 +20,212 @@ # CHECK: FAIL: shtest-shell :: cat-error-0.txt # CHECK: *** TEST 'shtest-shell :: cat-error-0.txt' FAILED *** -# CHECK: $ "cat" "-b" "temp1.txt" -# CHECK: # command stderr: -# CHECK: Unsupported: 'cat': option -b not recognized -# CHECK: error: command failed with exit status: 1 +# CHECK: cat -b temp1.txt +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Unsupported: 'cat': option -b not recognized +# CHECK: # error: command failed with exit status: 1 # CHECK: *** # CHECK: FAIL: shtest-shell :: cat-error-1.txt # CHECK: *** TEST 'shtest-shell :: cat-error-1.txt' FAILED *** -# CHECK: $ "cat" "temp1.txt" -# CHECK: # command stderr: -# CHECK: [Errno 2] No such file or directory: 'temp1.txt' -# CHECK: error: command failed with exit status: 1 +# CHECK: cat temp1.txt +# CHECK: # .---command stderr{{-*}} +# CHECK: # | [Errno 2] No such file or directory: 'temp1.txt' +# CHECK: # error: command failed with exit status: 1 # CHECK: *** # CHECK: FAIL: shtest-shell :: colon-error.txt # CHECK: *** TEST 'shtest-shell :: colon-error.txt' FAILED *** -# CHECK: $ ":" -# CHECK: # command stderr: -# CHECK: Unsupported: ':' cannot be part of a pipeline -# CHECK: error: command failed with exit status: 127 +# CHECK: : +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Unsupported: ':' cannot be part of a pipeline +# CHECK: # error: command failed with exit status: 127 # CHECK: *** # CHECK: PASS: shtest-shell :: continuations.txt # CHECK: PASS: shtest-shell :: dev-null.txt -# CHECK: FAIL: shtest-shell :: diff-b.txt -# CHECK: *** TEST 'shtest-shell :: diff-b.txt' FAILED *** -# CHECK: $ "diff" "-b" "{{[^"]*}}.0" "{{[^"]*}}.1" -# CHECK: # command output: -# CHECK: 1,2 -# CHECK-NEXT: {{^ }}f o o -# CHECK-NEXT: ! b a r -# CHECK-NEXT: --- -# CHECK-NEXT: {{^ }}f o o -# CHECK-NEXT: ! bar -# CHECK-EMPTY: -# CHECK: error: command failed with exit status: 1 -# CHECK: *** +# CHECK: FAIL: shtest-shell :: diff-b.txt +# CHECK: *** TEST 'shtest-shell :: diff-b.txt' FAILED *** +# CHECK: diff -b {{[^"]*}}.0 {{[^"]*}}.1 +# CHECK: # .---command stdout{{-*}} +# CHECK: # | {{.*}}1,2 +# CHECK-NEXT: # | f o o +# CHECK-NEXT: # | ! b a r +# CHECK-NEXT: # | --- +# CHECK-NEXT: # | f o o +# CHECK-NEXT: # | ! 
bar +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK: *** # CHECK: FAIL: shtest-shell :: diff-encodings.txt # CHECK: *** TEST 'shtest-shell :: diff-encodings.txt' FAILED *** -# CHECK: $ "diff" "-u" "diff-in.bin" "diff-in.bin" +# CHECK: diff -u diff-in.bin diff-in.bin +# CHECK-NEXT: # executed command: diff -u diff-in.bin diff-in.bin +# CHECK-NOT: error + +# CHECK: diff -u diff-in.utf16 diff-in.bin && false || true +# CHECK-NEXT: # executed command: diff -u diff-in.utf16 diff-in.bin +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK-NEXT: # | --- +# CHECK-NEXT: # | +++ +# CHECK-NEXT: # | @@ +# CHECK-NEXT: # | {{.f.o.o.$}} +# CHECK-NEXT: # | {{-.b.a.r.$}} +# CHECK-NEXT: # | {{\+.b.a.r.}} +# CHECK-NEXT: # | {{.b.a.z.$}} +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: diff -u diff-in.utf8 diff-in.bin && false || true +# CHECK-NEXT: # executed command: diff -u diff-in.utf8 diff-in.bin +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK-NEXT: # | --- +# CHECK-NEXT: # | +++ +# CHECK-NEXT: # | @@ +# CHECK-NEXT: # | -foo +# CHECK-NEXT: # | -bar +# CHECK-NEXT: # | -baz +# CHECK-NEXT: # | {{\+.f.o.o.$}} +# CHECK-NEXT: # | {{\+.b.a.r.}} +# CHECK-NEXT: # | {{\+.b.a.z.$}} +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: diff -u diff-in.bin diff-in.utf8 && false || true +# CHECK-NEXT: # executed command: diff -u diff-in.bin diff-in.utf8 +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK-NEXT: # | --- +# CHECK-NEXT: # | +++ +# CHECK-NEXT: # | @@ +# CHECK-NEXT: # | {{-.f.o.o.$}} +# CHECK-NEXT: # | {{-.b.a.r.}} +# CHECK-NEXT: # | {{-.b.a.z.$}} +# CHECK-NEXT: # | +foo +# CHECK-NEXT: # | +bar +# CHECK-NEXT: # | +baz +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: cat diff-in.bin | diff -u - diff-in.bin # CHECK-NOT: error -# CHECK: $ "diff" "-u" "diff-in.utf16" "diff-in.bin" -# CHECK: # command output: -# CHECK-NEXT: --- -# CHECK-NEXT: +++ -# CHECK-NEXT: @@ -# CHECK-NEXT: {{^ .f.o.o.$}} -# CHECK-NEXT: {{^-.b.a.r.$}} -# CHECK-NEXT: {{^\+.b.a.r.}} -# CHECK-NEXT: {{^ .b.a.z.$}} -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" - -# CHECK: $ "diff" "-u" "diff-in.utf8" "diff-in.bin" -# CHECK: # command output: -# CHECK-NEXT: --- -# CHECK-NEXT: +++ -# CHECK-NEXT: @@ -# CHECK-NEXT: -foo -# CHECK-NEXT: -bar -# CHECK-NEXT: -baz -# CHECK-NEXT: {{^\+.f.o.o.$}} -# CHECK-NEXT: {{^\+.b.a.r.}} -# CHECK-NEXT: {{^\+.b.a.z.$}} -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" - -# CHECK: $ "diff" "-u" "diff-in.bin" "diff-in.utf8" -# CHECK: # command output: -# CHECK-NEXT: --- -# CHECK-NEXT: +++ -# CHECK-NEXT: @@ -# CHECK-NEXT: {{^\-.f.o.o.$}} -# CHECK-NEXT: {{^\-.b.a.r.}} -# CHECK-NEXT: {{^\-.b.a.z.$}} -# CHECK-NEXT: +foo -# CHECK-NEXT: +bar -# CHECK-NEXT: +baz -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" - -# CHECK: $ "cat" "diff-in.bin" -# CHECK-NOT: error -# CHECK: $ "diff" "-u" "-" "diff-in.bin" -# CHECK-NOT: error - -# CHECK: $ "cat" "diff-in.bin" -# CHECK-NOT: error -# CHECK: $ "diff" "-u" "diff-in.bin" "-" -# CHECK-NOT: error - -# CHECK: $ "cat" "diff-in.bin" -# CHECK-NOT: error -# CHECK: $ "diff" "-u" "diff-in.utf16" "-" -# CHECK: # command output: -# CHECK-NEXT: --- -# CHECK-NEXT: +++ -# CHECK-NEXT: @@ -# CHECK-NEXT: {{^ .f.o.o.$}} -# 
CHECK-NEXT: {{^-.b.a.r.$}} -# CHECK-NEXT: {{^\+.b.a.r.}} -# CHECK-NEXT: {{^ .b.a.z.$}} -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" - -# CHECK: $ "cat" "diff-in.bin" +# CHECK: cat diff-in.bin | diff -u diff-in.bin - # CHECK-NOT: error -# CHECK: $ "diff" "-u" "diff-in.utf8" "-" -# CHECK: # command output: -# CHECK-NEXT: --- -# CHECK-NEXT: +++ -# CHECK-NEXT: @@ -# CHECK-NEXT: -foo -# CHECK-NEXT: -bar -# CHECK-NEXT: -baz -# CHECK-NEXT: {{^\+.f.o.o.$}} -# CHECK-NEXT: {{^\+.b.a.r.}} -# CHECK-NEXT: {{^\+.b.a.z.$}} -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" - -# CHECK: $ "diff" "-u" "-" "diff-in.utf8" -# CHECK: # command output: -# CHECK-NEXT: --- -# CHECK-NEXT: +++ -# CHECK-NEXT: @@ -# CHECK-NEXT: {{^\-.f.o.o.$}} -# CHECK-NEXT: {{^\-.b.a.r.}} -# CHECK-NEXT: {{^\-.b.a.z.$}} -# CHECK-NEXT: +foo -# CHECK-NEXT: +bar -# CHECK-NEXT: +baz -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" -# CHECK: $ "false" +# CHECK: cat diff-in.bin | diff -u diff-in.utf16 - && false || true +# CHECK-NEXT: # executed command: cat diff-in.bin +# CHECK-NEXT: # executed command: diff -u diff-in.utf16 - +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK-NEXT: # | --- +# CHECK-NEXT: # | +++ +# CHECK-NEXT: # | @@ +# CHECK-NEXT: # | {{.f.o.o.$}} +# CHECK-NEXT: # | {{-.b.a.r.$}} +# CHECK-NEXT: # | {{\+.b.a.r.}} +# CHECK-NEXT: # | {{.b.a.z.$}} +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: cat diff-in.bin | diff -u diff-in.utf8 - && false || true +# CHECK-NEXT: # executed command: cat diff-in.bin +# CHECK-NEXT: # executed command: diff -u diff-in.utf8 - +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK-NEXT: # | --- +# CHECK-NEXT: # | +++ +# CHECK-NEXT: # | @@ +# CHECK-NEXT: # | -foo +# CHECK-NEXT: # | -bar +# CHECK-NEXT: # | -baz +# CHECK-NEXT: # | {{\+.f.o.o.$}} +# CHECK-NEXT: # | {{\+.b.a.r.}} +# CHECK-NEXT: # | {{\+.b.a.z.$}} +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: cat diff-in.bin | diff -u - diff-in.utf8 && false || true +# CHECK-NEXT: # executed command: cat diff-in.bin +# CHECK-NEXT: # executed command: diff -u - diff-in.utf8 +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK-NEXT: # | --- +# CHECK-NEXT: # | +++ +# CHECK-NEXT: # | @@ +# CHECK-NEXT: # | {{-.f.o.o.$}} +# CHECK-NEXT: # | {{-.b.a.r.}} +# CHECK-NEXT: # | {{-.b.a.z.$}} +# CHECK-NEXT: # | +foo +# CHECK-NEXT: # | +bar +# CHECK-NEXT: # | +baz +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: false # CHECK: *** # CHECK: FAIL: shtest-shell :: diff-error-1.txt # CHECK: *** TEST 'shtest-shell :: diff-error-1.txt' FAILED *** -# CHECK: $ "diff" "-B" "temp1.txt" "temp2.txt" -# CHECK: # command stderr: -# CHECK: Unsupported: 'diff': option -B not recognized -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -B temp1.txt temp2.txt +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Unsupported: 'diff': option -B not recognized +# CHECK: # error: command failed with exit status: 1 # CHECK: *** # CHECK: FAIL: shtest-shell :: diff-error-2.txt # CHECK: *** TEST 'shtest-shell :: diff-error-2.txt' FAILED *** -# CHECK: $ "diff" "temp.txt" -# CHECK: # command stderr: -# CHECK: Error: missing or extra operand -# CHECK: error: command failed with exit status: 1 +# CHECK: diff temp.txt +# CHECK: # .---command 
stderr{{-*}} +# CHECK: # | Error: missing or extra operand +# CHECK: # error: command failed with exit status: 1 # CHECK: *** # CHECK: FAIL: shtest-shell :: diff-error-3.txt # CHECK: *** TEST 'shtest-shell :: diff-error-3.txt' FAILED *** -# CHECK: $ "diff" "temp.txt" "temp1.txt" -# CHECK: # command stderr: -# CHECK: Error: 'diff' command failed +# CHECK: diff temp.txt temp1.txt +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Error: 'diff' command failed # CHECK: error: command failed with exit status: 1 # CHECK: *** -# CHECK: FAIL: shtest-shell :: diff-error-4.txt -# CHECK: *** TEST 'shtest-shell :: diff-error-4.txt' FAILED *** -# CHECK: Exit Code: 1 -# CHECK: # command output: -# CHECK: diff-error-4.txt.tmp -# CHECK: diff-error-4.txt.tmp1 -# CHECK: *** 1 **** -# CHECK: ! hello-first -# CHECK: --- 1 ---- -# CHECK: ! hello-second -# CHECK: *** +# CHECK: FAIL: shtest-shell :: diff-error-4.txt +# CHECK: *** TEST 'shtest-shell :: diff-error-4.txt' FAILED *** +# CHECK: Exit Code: 1 +# CHECK: # .---command stdout{{-*}} +# CHECK-NEXT: # | {{.*}}diff-error-4.txt.tmp +# CHECK-NEXT: # | {{.*}}diff-error-4.txt.tmp1 +# CHECK-NEXT: # | {{\*+}} +# CHECK-NEXT: # | *** 1 **** +# CHECK-NEXT: # | ! hello-first +# CHECK-NEXT: # | --- 1 ---- +# CHECK-NEXT: # | ! hello-second +# CHECK-NEXT: # `---{{-*}} +# CHECK: *** # CHECK: FAIL: shtest-shell :: diff-error-5.txt # CHECK: *** TEST 'shtest-shell :: diff-error-5.txt' FAILED *** -# CHECK: $ "diff" -# CHECK: # command stderr: -# CHECK: Error: missing or extra operand -# CHECK: error: command failed with exit status: 1 +# CHECK: diff +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Error: missing or extra operand +# CHECK: # error: command failed with exit status: 1 # CHECK: *** # CHECK: FAIL: shtest-shell :: diff-error-6.txt # CHECK: *** TEST 'shtest-shell :: diff-error-6.txt' FAILED *** -# CHECK: $ "diff" -# CHECK: # command stderr: -# CHECK: Error: missing or extra operand -# CHECK: error: command failed with exit status: 1 +# CHECK: diff +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Error: missing or extra operand +# CHECK: # error: command failed with exit status: 1 # CHECK: *** @@ -223,137 +233,130 @@ # CHECK: *** TEST 'shtest-shell :: diff-pipes.txt' FAILED *** -# CHECK: $ "diff" "{{[^"]*}}.foo" "{{[^"]*}}.foo" -# CHECK-NOT: note -# CHECK-NOT: error -# CHECK: $ "FileCheck" +# CHECK: diff {{[^ ]*}}.foo {{.*}}.foo | FileCheck {{.*}} # CHECK-NOT: note # CHECK-NOT: error -# CHECK: $ "diff" "-u" "{{[^"]*}}.foo" "{{[^"]*}}.bar" -# CHECK: note: command had no output on stdout or stderr -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "FileCheck" -# CHECK-NOT: note -# CHECK-NOT: error -# CHECK: $ "true" +# CHECK: diff -u {{.*}}.foo {{.*}}.bar | FileCheck {{.*}} && false || true +# CHECK-NEXT: # executed command: diff -u {{.+}}.foo{{.*}} {{.+}}.bar{{.*}} +# CHECK-NEXT: # note: command had no output on stdout or stderr +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: FileCheck +# CHECK-NEXT: # executed command: true -# CHECK: $ "cat" "{{[^"]*}}.foo" -# CHECK: $ "diff" "-u" "-" "{{[^"]*}}.foo" +# CHECK: cat {{.*}}.foo | diff -u - {{.*}}.foo # CHECK-NOT: note # CHECK-NOT: error -# CHECK: $ "cat" "{{[^"]*}}.foo" -# CHECK: $ "diff" "-u" "{{[^"]*}}.foo" "-" +# CHECK: cat {{.*}}.foo | diff -u {{.*}}.foo - # CHECK-NOT: note # CHECK-NOT: error -# CHECK: $ "cat" "{{[^"]*}}.bar" -# CHECK: $ "diff" "-u" "{{[^"]*}}.foo" "-" -# CHECK: # command output: -# CHECK: @@ -# CHECK-NEXT: -foo -# CHECK-NEXT: +bar -# 
CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" - -# CHECK: $ "cat" "{{[^"]*}}.bar" -# CHECK: $ "diff" "-u" "-" "{{[^"]*}}.foo" -# CHECK: # command output: -# CHECK: @@ -# CHECK-NEXT: -bar -# CHECK-NEXT: +foo -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" - -# CHECK: $ "cat" "{{[^"]*}}.foo" -# CHECK: $ "diff" "-" "{{[^"]*}}.foo" -# CHECK-NOT: note -# CHECK-NOT: error -# CHECK: $ "FileCheck" +# CHECK: cat {{.*}}.bar | diff -u {{.*}}.foo - && false || true +# CHECK-NEXT: # executed command: cat {{.+}}.bar{{.*}} +# CHECK-NEXT: # executed command: diff -u {{.+}}.foo{{.*}} - +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK: # | @@ +# CHECK-NEXT: # | -foo +# CHECK-NEXT: # | +bar +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: cat {{.*}}.bar | diff -u - {{.*}}.foo && false || true +# CHECK-NEXT: # executed command: cat {{.+}}.bar{{.*}} +# CHECK-NEXT: # executed command: diff -u - {{.+}}.foo{{.*}} +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK: # | @@ +# CHECK-NEXT: # | -bar +# CHECK-NEXT: # | +foo +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: cat {{.*}}.foo | diff - {{.*}}.foo | FileCheck {{.*}} # CHECK-NOT: note # CHECK-NOT: error -# CHECK: $ "cat" "{{[^"]*}}.bar" -# CHECK: $ "diff" "-u" "{{[^"]*}}.foo" "-" -# CHECK: note: command had no output on stdout or stderr -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "FileCheck" -# CHECK-NOT: note -# CHECK-NOT: error -# CHECK: $ "true" +# CHECK: cat {{.*}}.bar | diff -u {{.*}}.foo - | FileCheck {{.*}} +# CHECK-NEXT: # executed command: cat {{.+}}.bar{{.*}} +# CHECK-NEXT: # executed command: diff -u {{.+}}.foo{{.*}} - +# CHECK-NEXT: note: command had no output on stdout or stderr +# CHECK-NEXT: error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: FileCheck +# CHECK-NEXT: # executed command: true -# CHECK: $ "false" +# CHECK: false # CHECK: *** # CHECK: FAIL: shtest-shell :: diff-r-error-0.txt # CHECK: *** TEST 'shtest-shell :: diff-r-error-0.txt' FAILED *** -# CHECK: $ "diff" "-r" -# CHECK: # command output: -# CHECK: Only in {{.*}}dir1: dir1unique -# CHECK: Only in {{.*}}dir2: dir2unique -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -r +# CHECK: # .---command stdout{{-*}} +# CHECK: # | Only in {{.*}}dir1: dir1unique +# CHECK: # | Only in {{.*}}dir2: dir2unique +# CHECK: # error: command failed with exit status: 1 # CHECK: FAIL: shtest-shell :: diff-r-error-1.txt # CHECK: *** TEST 'shtest-shell :: diff-r-error-1.txt' FAILED *** -# CHECK: $ "diff" "-r" -# CHECK: # command output: -# CHECK: *** {{.*}}dir1{{.*}}subdir{{.*}}f01 -# CHECK: --- {{.*}}dir2{{.*}}subdir{{.*}}f01 -# CHECK: 12345 -# CHECK: 00000 -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -r +# CHECK: # .---command stdout{{-*}} +# CHECK: # | *** {{.*}}dir1{{.*}}subdir{{.*}}f01 +# CHECK: # | --- {{.*}}dir2{{.*}}subdir{{.*}}f01 +# CHECK: # | ! 12345 +# CHECK: # | ! 
00000 +# CHECK: # error: command failed with exit status: 1 # CHECK: FAIL: shtest-shell :: diff-r-error-2.txt # CHECK: *** TEST 'shtest-shell :: diff-r-error-2.txt' FAILED *** -# CHECK: $ "diff" "-r" -# CHECK: # command output: -# CHECK: Only in {{.*}}dir2: extrafile -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -r +# CHECK: # .---command stdout{{-*}} +# CHECK: # | Only in {{.*}}dir2: extrafile +# CHECK: # error: command failed with exit status: 1 # CHECK: FAIL: shtest-shell :: diff-r-error-3.txt # CHECK: *** TEST 'shtest-shell :: diff-r-error-3.txt' FAILED *** -# CHECK: $ "diff" "-r" -# CHECK: # command output: -# CHECK: Only in {{.*}}dir1: extra_subdir -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -r +# CHECK: # .---command stdout{{-*}} +# CHECK: # | Only in {{.*}}dir1: extra_subdir +# CHECK: # error: command failed with exit status: 1 # CHECK: FAIL: shtest-shell :: diff-r-error-4.txt # CHECK: *** TEST 'shtest-shell :: diff-r-error-4.txt' FAILED *** -# CHECK: $ "diff" "-r" -# CHECK: # command output: -# CHECK: File {{.*}}dir1{{.*}}extra_subdir is a directory while file {{.*}}dir2{{.*}}extra_subdir is a regular file -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -r +# CHECK: # .---command stdout{{-*}} +# CHECK: # | File {{.*}}dir1{{.*}}extra_subdir is a directory while file {{.*}}dir2{{.*}}extra_subdir is a regular file +# CHECK: # error: command failed with exit status: 1 # CHECK: FAIL: shtest-shell :: diff-r-error-5.txt # CHECK: *** TEST 'shtest-shell :: diff-r-error-5.txt' FAILED *** -# CHECK: $ "diff" "-r" -# CHECK: # command output: -# CHECK: Only in {{.*}}dir1: extra_subdir -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -r +# CHECK: # .---command stdout{{-*}} +# CHECK: # | Only in {{.*}}dir1: extra_subdir +# CHECK: # error: command failed with exit status: 1 # CHECK: FAIL: shtest-shell :: diff-r-error-6.txt # CHECK: *** TEST 'shtest-shell :: diff-r-error-6.txt' FAILED *** -# CHECK: $ "diff" "-r" -# CHECK: # command output: -# CHECK: File {{.*}}dir1{{.*}}extra_file is a regular empty file while file {{.*}}dir2{{.*}}extra_file is a directory -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -r +# CHECK: # .---command stdout{{-*}} +# CHECK: # | File {{.*}}dir1{{.*}}extra_file is a regular empty file while file {{.*}}dir2{{.*}}extra_file is a directory +# CHECK: # error: command failed with exit status: 1 # CHECK: FAIL: shtest-shell :: diff-r-error-7.txt # CHECK: *** TEST 'shtest-shell :: diff-r-error-7.txt' FAILED *** -# CHECK: $ "diff" "-r" "-" "{{[^"]*}}" -# CHECK: # command stderr: -# CHECK: Error: cannot recursively compare '-' -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -r - {{.*}} +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Error: cannot recursively compare '-' +# CHECK: # error: command failed with exit status: 1 # CHECK: FAIL: shtest-shell :: diff-r-error-8.txt # CHECK: *** TEST 'shtest-shell :: diff-r-error-8.txt' FAILED *** -# CHECK: $ "diff" "-r" "{{[^"]*}}" "-" -# CHECK: # command stderr: -# CHECK: Error: cannot recursively compare '-' -# CHECK: error: command failed with exit status: 1 +# CHECK: diff -r {{.*}} - +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Error: cannot recursively compare '-' +# CHECK: # error: command failed with exit status: 1 # CHECK: PASS: shtest-shell :: diff-r.txt @@ -362,51 +365,59 @@ # CHECK: *** TEST 'shtest-shell :: diff-strip-trailing-cr.txt' FAILED *** -# CHECK: $ "diff" "-u" "diff-in.dos" "diff-in.unix" -# 
CHECK: # command output:
-# CHECK: @@
-# CHECK-NEXT: -In this file, the
-# CHECK-NEXT: -sequence "\r\n"
-# CHECK-NEXT: -terminates lines.
-# CHECK-NEXT: +In this file, the
-# CHECK-NEXT: +sequence "\n"
-# CHECK-NEXT: +terminates lines.
-# CHECK: error: command failed with exit status: 1
-# CHECK: $ "true"
-
-# CHECK: $ "diff" "-u" "diff-in.unix" "diff-in.dos"
-# CHECK: # command output:
-# CHECK: @@
-# CHECK-NEXT: -In this file, the
-# CHECK-NEXT: -sequence "\n"
-# CHECK-NEXT: -terminates lines.
-# CHECK-NEXT: +In this file, the
-# CHECK-NEXT: +sequence "\r\n"
-# CHECK-NEXT: +terminates lines.
-# CHECK: error: command failed with exit status: 1
-# CHECK: $ "true"
-
-# CHECK: $ "diff" "-u" "--strip-trailing-cr" "diff-in.dos" "diff-in.unix"
-# CHECK: # command output:
-# CHECK: @@
-# CHECK-NEXT: In this file, the
-# CHECK-NEXT: -sequence "\r\n"
-# CHECK-NEXT: +sequence "\n"
-# CHECK-NEXT: terminates lines.
-# CHECK: error: command failed with exit status: 1
-# CHECK: $ "true"
-
-# CHECK: $ "diff" "-u" "--strip-trailing-cr" "diff-in.unix" "diff-in.dos"
-# CHECK: # command output:
-# CHECK: @@
-# CHECK-NEXT: In this file, the
-# CHECK-NEXT: -sequence "\n"
-# CHECK-NEXT: +sequence "\r\n"
-# CHECK-NEXT: terminates lines.
-# CHECK: error: command failed with exit status: 1
-# CHECK: $ "true"
-
-# CHECK: $ "false"
+# CHECK: diff -u diff-in.dos diff-in.unix && false || true
+# CHECK-NEXT: # executed command: diff -u diff-in.dos diff-in.unix
+# CHECK-NEXT: # .---command stdout{{-*}}
+# CHECK: # | @@
+# CHECK-NEXT: # | -In this file, the
+# CHECK-NEXT: # | -sequence "\r\n"
+# CHECK-NEXT: # | -terminates lines.
+# CHECK-NEXT: # | +In this file, the
+# CHECK-NEXT: # | +sequence "\n"
+# CHECK-NEXT: # | +terminates lines.
+# CHECK-NEXT: # `---{{-*}}
+# CHECK-NEXT: # error: command failed with exit status: 1
+# CHECK-NEXT: # executed command: true
+
+# CHECK: diff -u diff-in.unix diff-in.dos && false || true
+# CHECK-NEXT: # executed command: diff -u diff-in.unix diff-in.dos
+# CHECK-NEXT: # .---command stdout{{-*}}
+# CHECK: # | @@
+# CHECK-NEXT: # | -In this file, the
+# CHECK-NEXT: # | -sequence "\n"
+# CHECK-NEXT: # | -terminates lines.
+# CHECK-NEXT: # | +In this file, the
+# CHECK-NEXT: # | +sequence "\r\n"
+# CHECK-NEXT: # | +terminates lines.
+# CHECK-NEXT: # `---{{-*}}
+# CHECK-NEXT: # error: command failed with exit status: 1
+# CHECK-NEXT: # executed command: true
+
+# CHECK: diff -u --strip-trailing-cr diff-in.dos diff-in.unix && false || true
+# CHECK-NEXT: # executed command: diff -u --strip-trailing-cr diff-in.dos diff-in.unix
+# CHECK-NEXT: # .---command stdout{{-*}}
+# CHECK: # | @@
+# CHECK-NEXT: # | In this file, the
+# CHECK-NEXT: # | -sequence "\r\n"
+# CHECK-NEXT: # | +sequence "\n"
+# CHECK-NEXT: # | terminates lines.
+# CHECK-NEXT: # `---{{-*}}
+# CHECK-NEXT: # error: command failed with exit status: 1
+# CHECK-NEXT: # executed command: true
+
+# CHECK: diff -u --strip-trailing-cr diff-in.unix diff-in.dos && false || true
+# CHECK-NEXT: # executed command: diff -u --strip-trailing-cr diff-in.unix diff-in.dos
+# CHECK-NEXT: # .---command stdout{{-*}}
+# CHECK: # | @@
+# CHECK-NEXT: # | In this file, the
+# CHECK-NEXT: # | -sequence "\n"
+# CHECK-NEXT: # | +sequence "\r\n"
+# CHECK-NEXT: # | terminates lines.
+# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: false # CHECK: *** @@ -415,106 +426,144 @@ # CHECK: *** TEST 'shtest-shell :: diff-unified.txt' FAILED *** -# CHECK: $ "diff" "-u" "{{[^"]*}}.foo" "{{[^"]*}}.bar" -# CHECK: # command output: -# CHECK: @@ {{.*}} @@ -# CHECK-NEXT: 3 -# CHECK-NEXT: 4 -# CHECK-NEXT: 5 -# CHECK-NEXT: -6 foo -# CHECK-NEXT: +6 bar -# CHECK-NEXT: 7 -# CHECK-NEXT: 8 -# CHECK-NEXT: 9 -# CHECK-EMPTY: -# CHECK-NEXT: error: command failed with exit status: 1 -# CHECK-NEXT: $ "true" - -# CHECK: $ "diff" "-U" "2" "{{[^"]*}}.foo" "{{[^"]*}}.bar" -# CHECK: # command output: -# CHECK: @@ {{.*}} @@ -# CHECK-NEXT: 4 -# CHECK-NEXT: 5 -# CHECK-NEXT: -6 foo -# CHECK-NEXT: +6 bar -# CHECK-NEXT: 7 -# CHECK-NEXT: 8 -# CHECK-EMPTY: -# CHECK-NEXT: error: command failed with exit status: 1 -# CHECK-NEXT: $ "true" - -# CHECK: $ "diff" "-U4" "{{[^"]*}}.foo" "{{[^"]*}}.bar" -# CHECK: # command output: -# CHECK: @@ {{.*}} @@ -# CHECK-NEXT: 2 -# CHECK-NEXT: 3 -# CHECK-NEXT: 4 -# CHECK-NEXT: 5 -# CHECK-NEXT: -6 foo -# CHECK-NEXT: +6 bar -# CHECK-NEXT: 7 -# CHECK-NEXT: 8 -# CHECK-NEXT: 9 -# CHECK-NEXT: 10 -# CHECK-EMPTY: -# CHECK-NEXT: error: command failed with exit status: 1 -# CHECK-NEXT: $ "true" - -# CHECK: $ "diff" "-U0" "{{[^"]*}}.foo" "{{[^"]*}}.bar" -# CHECK: # command output: -# CHECK: @@ {{.*}} @@ -# CHECK-NEXT: -6 foo -# CHECK-NEXT: +6 bar -# CHECK-EMPTY: -# CHECK-NEXT: error: command failed with exit status: 1 -# CHECK-NEXT: $ "true" - -# CHECK: $ "diff" "-U" "30.1" "{{[^"]*}}" "{{[^"]*}}" -# CHECK: # command stderr: -# CHECK: Error: invalid '-U' argument: 30.1 -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" - -# CHECK: $ "diff" "-U-1" "{{[^"]*}}" "{{[^"]*}}" -# CHECK: # command stderr: -# CHECK: Error: invalid '-U' argument: -1 -# CHECK: error: command failed with exit status: 1 -# CHECK: $ "true" - -# CHECK: $ "false" +# CHECK: diff -u {{.*}}.foo {{.*}}.bar && false || true +# CHECK-NEXT: # executed command: diff -u {{.+}}.foo{{.*}} {{.+}}.bar{{.*}} +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK: # | @@ {{.*}} @@ +# CHECK-NEXT: # | 3 +# CHECK-NEXT: # | 4 +# CHECK-NEXT: # | 5 +# CHECK-NEXT: # | -6 foo +# CHECK-NEXT: # | +6 bar +# CHECK-NEXT: # | 7 +# CHECK-NEXT: # | 8 +# CHECK-NEXT: # | 9 +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: diff -U 2 {{.*}}.foo {{.*}}.bar && false || true +# CHECK-NEXT: # executed command: diff -U 2 {{.+}}.foo{{.*}} {{.+}}.bar{{.*}} +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK: # | @@ {{.*}} @@ +# CHECK-NEXT: # | 4 +# CHECK-NEXT: # | 5 +# CHECK-NEXT: # | -6 foo +# CHECK-NEXT: # | +6 bar +# CHECK-NEXT: # | 7 +# CHECK-NEXT: # | 8 +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: diff -U4 {{.*}}.foo {{.*}}.bar && false || true +# CHECK-NEXT: # executed command: diff -U4 {{.+}}.foo{{.*}} {{.+}}.bar{{.*}} +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK: # | @@ {{.*}} @@ +# CHECK-NEXT: # | 2 +# CHECK-NEXT: # | 3 +# CHECK-NEXT: # | 4 +# CHECK-NEXT: # | 5 +# CHECK-NEXT: # | -6 foo +# CHECK-NEXT: # | +6 bar +# CHECK-NEXT: # | 7 +# CHECK-NEXT: # | 8 +# CHECK-NEXT: # | 9 +# CHECK-NEXT: # | 10 +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: diff -U0 {{.*}}.foo {{.*}}.bar && 
false || true +# CHECK-NEXT: # executed command: diff -U0 {{.+}}.foo{{.*}} {{.+}}.bar{{.*}} +# CHECK-NEXT: # .---command stdout{{-*}} +# CHECK: # | @@ {{.*}} @@ +# CHECK-NEXT: # | -6 foo +# CHECK-NEXT: # | +6 bar +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK-NEXT: # executed command: true + +# CHECK: diff -U 30.1 {{.*}} {{.*}} && false || true +# CHECK: # executed command: diff -U 30.1 {{.*}} {{.*}} +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Error: invalid '-U' argument: 30.1 +# CHECK: # error: command failed with exit status: 1 +# CHECK: # executed command: true + +# CHECK: diff -U-1 {{.*}} {{.*}} && false || true +# CHECK: # executed command: diff -U-1 {{.*}} {{.*}} +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Error: invalid '-U' argument: -1 +# CHECK: # error: command failed with exit status: 1 +# CHECK: # executed command: true + +# CHECK: false # CHECK: *** -# CHECK: FAIL: shtest-shell :: diff-w.txt -# CHECK: *** TEST 'shtest-shell :: diff-w.txt' FAILED *** -# CHECK: $ "diff" "-w" "{{[^"]*}}.0" "{{[^"]*}}.1" -# CHECK: # command output: -# CHECK: 1,3 -# CHECK-NEXT: {{^ }}foo -# CHECK-NEXT: {{^ }}bar -# CHECK-NEXT: ! baz -# CHECK-NEXT: --- -# CHECK-NEXT: {{^ }}foo -# CHECK-NEXT: {{^ }}bar -# CHECK-NEXT: ! bat -# CHECK-EMPTY: -# CHECK: error: command failed with exit status: 1 -# CHECK: *** +# CHECK: FAIL: shtest-shell :: diff-w.txt +# CHECK: *** TEST 'shtest-shell :: diff-w.txt' FAILED *** +# CHECK: diff -w {{.*}}.0 {{.*}}.1 +# CHECK: # .---command stdout{{-*}} +# CHECK: # | {{\*+}} 1,3 +# CHECK-NEXT: # | foo +# CHECK-NEXT: # | bar +# CHECK-NEXT: # | ! baz +# CHECK-NEXT: # | --- +# CHECK-NEXT: # | foo +# CHECK-NEXT: # | bar +# CHECK-NEXT: # | ! bat +# CHECK-NEXT: # `---{{-*}} +# CHECK-NEXT: # error: command failed with exit status: 1 +# CHECK: *** + +# CHECK: FAIL: shtest-shell :: echo-at-redirect-stderr.txt +# CHECK: *** TEST 'shtest-shell :: echo-at-redirect-stderr.txt' FAILED *** +# CHECK: @echo 2> {{.*}} +# CHECK: # executed command: @echo +# CHECK: # .---command stderr{{-*}} +# CHECK: # | stdin and stderr redirects not supported for @echo +# CHECK: error: command failed with exit status: + +# CHECK: FAIL: shtest-shell :: echo-at-redirect-stdin.txt +# CHECK: *** TEST 'shtest-shell :: echo-at-redirect-stdin.txt' FAILED *** +# CHECK: @echo < {{.*}} +# CHECK: # executed command: @echo +# CHECK: # .---command stderr{{-*}} +# CHECK: # | stdin and stderr redirects not supported for @echo +# CHECK: error: command failed with exit status: + +# CHECK: FAIL: shtest-shell :: echo-redirect-stderr.txt +# CHECK: *** TEST 'shtest-shell :: echo-redirect-stderr.txt' FAILED *** +# CHECK: echo 2> {{.*}} +# CHECK: # executed command: echo +# CHECK: # .---command stderr{{-*}} +# CHECK: # | stdin and stderr redirects not supported for echo +# CHECK: error: command failed with exit status: + +# CHECK: FAIL: shtest-shell :: echo-redirect-stdin.txt +# CHECK: *** TEST 'shtest-shell :: echo-redirect-stdin.txt' FAILED *** +# CHECK: echo < {{.*}} +# CHECK: # executed command: echo +# CHECK: # .---command stderr{{-*}} +# CHECK: # | stdin and stderr redirects not supported for echo +# CHECK: error: command failed with exit status: # CHECK: FAIL: shtest-shell :: error-0.txt # CHECK: *** TEST 'shtest-shell :: error-0.txt' FAILED *** -# CHECK: $ "not-a-real-command" -# CHECK: # command stderr: -# CHECK: 'not-a-real-command': command not found -# CHECK: error: command failed with exit status: 127 +# CHECK: not-a-real-command +# CHECK: # .---command 
stderr{{-*}} +# CHECK: # | 'not-a-real-command': command not found +# CHECK: # error: command failed with exit status: 127 # CHECK: *** # FIXME: The output here sucks. # # CHECK: FAIL: shtest-shell :: error-1.txt # CHECK: *** TEST 'shtest-shell :: error-1.txt' FAILED *** -# CHECK: shell parser error on: ': \'RUN: at line 3\'; echo "missing quote' +# CHECK: shell parser error on RUN: at line 3: echo "missing quote # CHECK: *** # CHECK: FAIL: shtest-shell :: error-2.txt @@ -524,52 +573,52 @@ # CHECK: FAIL: shtest-shell :: mkdir-error-0.txt # CHECK: *** TEST 'shtest-shell :: mkdir-error-0.txt' FAILED *** -# CHECK: $ "mkdir" "-p" "temp" -# CHECK: # command stderr: -# CHECK: Unsupported: 'mkdir' cannot be part of a pipeline -# CHECK: error: command failed with exit status: 127 +# CHECK: mkdir -p temp | rm -rf temp +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Unsupported: 'mkdir' cannot be part of a pipeline +# CHECK: # error: command failed with exit status: 127 # CHECK: *** # CHECK: FAIL: shtest-shell :: mkdir-error-1.txt # CHECK: *** TEST 'shtest-shell :: mkdir-error-1.txt' FAILED *** -# CHECK: $ "mkdir" "-p" "-m" "777" "temp" -# CHECK: # command stderr: -# CHECK: Unsupported: 'mkdir': option -m not recognized -# CHECK: error: command failed with exit status: 127 +# CHECK: mkdir -p -m 777 temp +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Unsupported: 'mkdir': option -m not recognized +# CHECK: # error: command failed with exit status: 127 # CHECK: *** # CHECK: FAIL: shtest-shell :: mkdir-error-2.txt # CHECK: *** TEST 'shtest-shell :: mkdir-error-2.txt' FAILED *** -# CHECK: $ "mkdir" "-p" -# CHECK: # command stderr: -# CHECK: Error: 'mkdir' is missing an operand -# CHECK: error: command failed with exit status: 127 +# CHECK: mkdir -p +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Error: 'mkdir' is missing an operand +# CHECK: # error: command failed with exit status: 127 # CHECK: *** # CHECK: PASS: shtest-shell :: redirects.txt # CHECK: FAIL: shtest-shell :: rm-error-0.txt # CHECK: *** TEST 'shtest-shell :: rm-error-0.txt' FAILED *** -# CHECK: $ "rm" "-rf" "temp" -# CHECK: # command stderr: -# CHECK: Unsupported: 'rm' cannot be part of a pipeline -# CHECK: error: command failed with exit status: 127 +# CHECK: rm -rf temp | echo "hello" +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Unsupported: 'rm' cannot be part of a pipeline +# CHECK: # error: command failed with exit status: 127 # CHECK: *** # CHECK: FAIL: shtest-shell :: rm-error-1.txt # CHECK: *** TEST 'shtest-shell :: rm-error-1.txt' FAILED *** -# CHECK: $ "rm" "-f" "-v" "temp" -# CHECK: # command stderr: -# CHECK: Unsupported: 'rm': option -v not recognized -# CHECK: error: command failed with exit status: 127 +# CHECK: rm -f -v temp +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Unsupported: 'rm': option -v not recognized +# CHECK: # error: command failed with exit status: 127 # CHECK: *** # CHECK: FAIL: shtest-shell :: rm-error-2.txt # CHECK: *** TEST 'shtest-shell :: rm-error-2.txt' FAILED *** -# CHECK: $ "rm" "-r" "hello" -# CHECK: # command stderr: -# CHECK: Error: 'rm' command failed -# CHECK: error: command failed with exit status: 1 +# CHECK: rm -r hello +# CHECK: # .---command stderr{{-*}} +# CHECK: # | Error: 'rm' command failed +# CHECK: # error: command failed with exit status: 1 # CHECK: *** # CHECK: FAIL: shtest-shell :: rm-error-3.txt @@ -581,16 +630,17 @@ # CHECK: PASS: shtest-shell :: sequencing-0.txt # CHECK: XFAIL: shtest-shell :: sequencing-1.txt -# CHECK: FAIL: shtest-shell :: 
stdout-encoding.txt
-# CHECK: *** TEST 'shtest-shell :: stdout-encoding.txt' FAILED ***
-# CHECK: $ "cat" "diff-in.bin"
-# CHECK: # command output:
-# CHECK-NEXT: {{^.f.o.o.$}}
-# CHECK-NEXT: {{^.b.a.r.}}
-# CHECK-NEXT: {{^.b.a.z.$}}
-# CHECK-NOT: error
-# CHECK: $ "false"
-# CHECK: ***
+# CHECK: FAIL: shtest-shell :: stdout-encoding.txt
+# CHECK: *** TEST 'shtest-shell :: stdout-encoding.txt' FAILED ***
+# CHECK: cat diff-in.bin
+# CHECK: # .---command stdout{{-*}}
+# CHECK-NEXT: # | {{.f.o.o.$}}
+# CHECK-NEXT: # | {{.b.a.r.}}
+# CHECK-NEXT: # | {{.b.a.z.$}}
+# CHECK-NEXT: # `---{{-*}}
+# CHECK-NOT: error
+# CHECK: false
+# CHECK: ***

# CHECK: PASS: shtest-shell :: valid-shell.txt

-# CHECK: Failed Tests (35)
+# CHECK: Failed Tests (39)
diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h
index b5c6a3094bc67..68eccab6dbaca 100644
--- a/mlir/include/mlir-c/IR.h
+++ b/mlir/include/mlir-c/IR.h
@@ -48,6 +48,7 @@ extern "C" {
 };                                                                           \
   typedef struct name name

+DEFINE_C_API_STRUCT(MlirAsmState, void);
 DEFINE_C_API_STRUCT(MlirBytecodeWriterConfig, void);
 DEFINE_C_API_STRUCT(MlirContext, void);
 DEFINE_C_API_STRUCT(MlirDialect, void);
@@ -383,6 +384,29 @@ mlirOperationStateAddAttributes(MlirOperationState *state, intptr_t n,
 MLIR_CAPI_EXPORTED void
 mlirOperationStateEnableResultTypeInference(MlirOperationState *state);

+//===----------------------------------------------------------------------===//
+// AsmState API.
+// While many of these are simple settings that could be represented in a
+// struct, they are wrapped in a heap allocated object and accessed via
+// functions to maximize the possibility of compatibility over time.
+//===----------------------------------------------------------------------===//
+
+/// Creates a new AsmState. As with AsmState, the IR should not be mutated
+/// in between uses of this state.
+/// Must be freed with a call to mlirAsmStateDestroy().
+// TODO: This should be expanded to handle location & resource map.
+MLIR_CAPI_EXPORTED MlirAsmState
+mlirAsmStateCreateForOperation(MlirOperation op, MlirOpPrintingFlags flags);
+
+/// Creates a new AsmState from a value.
+/// Must be freed with a call to mlirAsmStateDestroy().
+// TODO: This should be expanded to handle location & resource map.
+MLIR_CAPI_EXPORTED MlirAsmState
+mlirAsmStateCreateForValue(MlirValue value, MlirOpPrintingFlags flags);
+
+/// Destroys an AsmState created with one of the mlirAsmStateCreateFor*
+/// functions.
+MLIR_CAPI_EXPORTED void mlirAsmStateDestroy(MlirAsmState state);
+
 //===----------------------------------------------------------------------===//
 // Op Printing flags API.
 // While many of these are simple settings that could be represented in a
@@ -815,7 +839,7 @@ mlirValuePrint(MlirValue value, MlirStringCallback callback, void *userData);

 /// Prints a value as an operand (i.e., the ValueID).
MLIR_CAPI_EXPORTED void mlirValuePrintAsOperand(MlirValue value, - MlirOpPrintingFlags flags, + MlirAsmState state, MlirStringCallback callback, void *userData); diff --git a/mlir/include/mlir/CAPI/IR.h b/mlir/include/mlir/CAPI/IR.h index b8ccec896c27b..1836cb0acb67e 100644 --- a/mlir/include/mlir/CAPI/IR.h +++ b/mlir/include/mlir/CAPI/IR.h @@ -21,6 +21,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" +DEFINE_C_API_PTR_METHODS(MlirAsmState, mlir::AsmState) DEFINE_C_API_PTR_METHODS(MlirBytecodeWriterConfig, mlir::BytecodeWriterConfig) DEFINE_C_API_PTR_METHODS(MlirContext, mlir::MLIRContext) DEFINE_C_API_PTR_METHODS(MlirDialect, mlir::Dialect) diff --git a/mlir/include/mlir/Dialect/MLProgram/CMakeLists.txt b/mlir/include/mlir/Dialect/MLProgram/CMakeLists.txt index f33061b2d87cf..9f57627c321fb 100644 --- a/mlir/include/mlir/Dialect/MLProgram/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/MLProgram/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(IR) +add_subdirectory(Transforms) diff --git a/mlir/include/mlir/Dialect/MLProgram/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/MLProgram/Transforms/CMakeLists.txt new file mode 100644 index 0000000000000..c5c11f17a9fa9 --- /dev/null +++ b/mlir/include/mlir/Dialect/MLProgram/Transforms/CMakeLists.txt @@ -0,0 +1,6 @@ +set(LLVM_TARGET_DEFINITIONS Passes.td) +mlir_tablegen(Passes.h.inc -gen-pass-decls -name MLProgram) +add_public_tablegen_target(MLIRMLProgramPassIncGen) +add_dependencies(mlir-headers MLIRMLProgramPassIncGen) + +add_mlir_doc(Passes MLProgramPasses ./ -gen-pass-doc) diff --git a/mlir/include/mlir/Dialect/MLProgram/Transforms/Passes.h b/mlir/include/mlir/Dialect/MLProgram/Transforms/Passes.h new file mode 100644 index 0000000000000..894e35e52724e --- /dev/null +++ b/mlir/include/mlir/Dialect/MLProgram/Transforms/Passes.h @@ -0,0 +1,35 @@ +//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_MLPROGRAM_TRANSFORMS_PASSES_H_ +#define MLIR_DIALECT_MLPROGRAM_TRANSFORMS_PASSES_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace ml_program { + +#define GEN_PASS_DECL +#include "mlir/Dialect/MLProgram/Transforms/Passes.h.inc" + +//===----------------------------------------------------------------------===// +// Registration +//===----------------------------------------------------------------------===// + +std::unique_ptr> createMLProgramPipelineGlobalsPass(); + +/// Generate the code for registering passes. +#define GEN_PASS_REGISTRATION +#include "mlir/Dialect/MLProgram/Transforms/Passes.h.inc" + +} // namespace ml_program +} // namespace mlir + +#endif // MLIR_DIALECT_MLPROGRAM_TRANSFORMS_PASSES_H_ diff --git a/mlir/include/mlir/Dialect/MLProgram/Transforms/Passes.td b/mlir/include/mlir/Dialect/MLProgram/Transforms/Passes.td new file mode 100644 index 0000000000000..defe8191cb905 --- /dev/null +++ b/mlir/include/mlir/Dialect/MLProgram/Transforms/Passes.td @@ -0,0 +1,27 @@ +//===-- Passes.td - pass definition file -------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_MLPROGRAM_TRANSFORMS_PASSES
+#define MLIR_DIALECT_MLPROGRAM_TRANSFORMS_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def MLProgramPipelineGlobals : Pass<"mlprogram-pipeline-globals", "ModuleOp"> {
+  let summary = "Optimize `ml_program` global operations for read and store";
+  let description = [{
+    `ml_program`'s load and store operations can be optimized for
+    write-write or write-read sets of operations. This avoids re-reading
+    known tensors whose values are already available in the IR.
+
+    The pass is designed to handle both nested regions and function calls
+    safely.
+  }];
+  let constructor = "mlir::ml_program::createMLProgramPipelineGlobalsPass()";
+}
+
+#endif // MLIR_DIALECT_MLPROGRAM_TRANSFORMS_PASSES
diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
index 258b87d7471d3..419e24a733536 100644
--- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
@@ -7,6 +7,8 @@ mlir_tablegen(OpenMPOpsDialect.h.inc -gen-dialect-decls -dialect=omp)
 mlir_tablegen(OpenMPOpsDialect.cpp.inc -gen-dialect-defs -dialect=omp)
 mlir_tablegen(OpenMPOps.h.inc -gen-op-decls)
 mlir_tablegen(OpenMPOps.cpp.inc -gen-op-defs)
+mlir_tablegen(OpenMPOpsTypes.h.inc -gen-typedef-decls -typedefs-dialect=omp)
+mlir_tablegen(OpenMPOpsTypes.cpp.inc -gen-typedef-defs -typedefs-dialect=omp)
 mlir_tablegen(OpenMPOpsEnums.h.inc -gen-enum-decls)
 mlir_tablegen(OpenMPOpsEnums.cpp.inc -gen-enum-defs)
 mlir_tablegen(OpenMPOpsAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=omp)
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
index 584ddc170c2f4..23509c5b60701 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
@@ -22,6 +22,9 @@
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"

+#define GET_TYPEDEF_CLASSES
+#include "mlir/Dialect/OpenMP/OpenMPOpsTypes.h.inc"
+
 #include "mlir/Dialect/OpenMP/OpenMPOpsDialect.h.inc"
 #include "mlir/Dialect/OpenMP/OpenMPOpsEnums.h.inc"
 #include "mlir/Dialect/OpenMP/OpenMPTypeInterfaces.h.inc"
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 86fe62e77b535..bcd60d8046c89 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -29,6 +29,7 @@ def OpenMP_Dialect : Dialect {
   let cppNamespace = "::mlir::omp";
   let dependentDialects = ["::mlir::LLVM::LLVMDialect, ::mlir::func::FuncDialect"];
   let useDefaultAttributePrinterParser = 1;
+  let useDefaultTypePrinterParser = 1;
 }

 // OmpCommon requires definition of OpenACC_Dialect.
@@ -89,6 +90,10 @@ def IntLikeType : AnyTypeOf<[AnyInteger, Index]>;

 def OpenMP_PointerLikeType : TypeAlias;

+class OpenMP_Type : TypeDef {
+  let mnemonic = typeMnemonic;
+}
+
 //===----------------------------------------------------------------------===//
 // 2.12.7 Declare Target Directive
 //===----------------------------------------------------------------------===//
@@ -1004,6 +1009,220 @@ def FlushOp : OpenMP_Op<"flush"> {
   }];
 }

+//===----------------------------------------------------------------------===//
+// Map related constructs
+//===----------------------------------------------------------------------===//
+
+def CaptureThis : I32EnumAttrCase<"This", 0>;
+def CaptureByRef : I32EnumAttrCase<"ByRef", 1>;
+def CaptureByCopy : I32EnumAttrCase<"ByCopy", 2>;
+def CaptureVLAType : I32EnumAttrCase<"VLAType", 3>;
+
+def VariableCaptureKind : I32EnumAttr<
+    "VariableCaptureKind",
+    "variable capture kind",
+    [CaptureThis, CaptureByRef, CaptureByCopy, CaptureVLAType]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::omp";
+}
+
+def VariableCaptureKindAttr : EnumAttr {
+  let assemblyFormat = "`(` $value `)`";
+}
+
+def DataBoundsType : OpenMP_Type<"DataBounds", "data_bounds_ty"> {
+  let summary = "Type for representing omp data clause bounds information";
+}
+
+def DataBoundsOp : OpenMP_Op<"bounds",
+    [AttrSizedOperandSegments, NoMemoryEffect]> {
+  let summary = "Represents normalized bounds information for map clauses.";
+
+  let description = [{
+    This operation is a variation on the OpenACC dialect's DataBoundsOp. Within
+    the OpenMP dialect it stores the bounds/range of data to be mapped to a
+    device specified by map clauses on target directives. Within the OpenMP
+    dialect, the DataBoundsOp is associated with MapInfoOp, helping to store
+    bounds information for the mapped variable.
+
+    It is used to support OpenMP array sectioning, Fortran pointer and
+    allocatable mapping, and pointer/allocatable members of derived types.
+    In all cases the DataBoundsOp holds information on the section of
+    data to be mapped, such as its upper and lower bounds. This information is
+    currently utilised by the LLVM-IR lowering to help generate instructions to
+    copy data to and from the device when processing target operations.
+
+    The example below copies a section of a 10-element array (all except the
+    first element), utilising OpenMP array sectioning syntax, where array
+    subscripts are provided to specify the bounds to be mapped to the device.
+    To simplify the examples, constants are used directly; in reality
+    they will be MLIR SSA values.
+
+    C++:
+    ```
+    int array[10];
+    #pragma omp target map(array[1:9])
+    ```
+    =>
+    ```mlir
+    omp.bounds lower_bound(1) upper_bound(9) extent(9) start_idx(0)
+    ```
+
+    Fortran:
+    ```
+    integer :: array(1:10)
+    !$omp target map(array(2:10))
+    ```
+    =>
+    ```mlir
+    omp.bounds lower_bound(1) upper_bound(9) extent(9) start_idx(1)
+    ```
+
+    For Fortran pointers and allocatables (as well as those that are
+    members of derived types) the bounds information is provided by
+    the Fortran compiler and runtime through descriptor information.
+
+    A basic pointer example can be found below (constants again
+    provided for simplicity; in reality these will be SSA values that
+    point to data yielded by Fortran's descriptors):
+
+    Fortran:
+    ```
+    integer, pointer :: ptr(:)
+    allocate(ptr(10))
+    !$omp target map(ptr)
+    ```
+    =>
+    ```mlir
+    omp.bounds lower_bound(0) upper_bound(9) extent(10) start_idx(1)
+    ```
+
+    This operation records the bounds information in a normalized fashion
+    (zero-based). This works well with the `PointerLikeType`
+    requirement in data clauses - since a `lower_bound` of 0 means looking
+    at data at the zero offset from the pointer.
+
+    This operation must have an `upper_bound` or `extent` (both are allowed,
+    but they are not checked for consistency). When the source language's
+    arrays are not zero-based, the `start_idx` must specify the zero-position
+    index.
+  }];
+
+  let arguments = (ins Optional:$lower_bound,
+                       Optional:$upper_bound,
+                       Optional:$extent,
+                       Optional:$stride,
+                       DefaultValuedAttr:$stride_in_bytes,
+                       Optional:$start_idx);
+  let results = (outs DataBoundsType:$result);
+
+  let assemblyFormat = [{
+    oilist(
+      `lower_bound` `(` $lower_bound `:` type($lower_bound) `)`
+    | `upper_bound` `(` $upper_bound `:` type($upper_bound) `)`
+    | `extent` `(` $extent `:` type($extent) `)`
+    | `stride` `(` $stride `:` type($stride) `)`
+    | `start_idx` `(` $start_idx `:` type($start_idx) `)`
+    ) attr-dict
+  }];
+
+  let extraClassDeclaration = [{
+    /// The number of variable operands.
+    unsigned getNumVariableOperands() {
+      return getNumOperands();
+    }
+
+    /// The i-th variable operand passed.
+    Value getVariableOperand(unsigned i) {
+      return getOperands()[i];
+    }
+  }];
+
+  let hasVerifier = 1;
+}
+
+def MapInfoOp : OpenMP_Op<"map_info", [AttrSizedOperandSegments]> {
+  let arguments = (ins OpenMP_PointerLikeType:$var_ptr,
+                       Optional:$var_ptr_ptr,
+                       Variadic:$bounds, /* rank-0 to rank-{n-1} */
+                       OptionalAttr:$map_type,
+                       OptionalAttr:$map_capture_type,
+                       DefaultValuedAttr:$implicit,
+                       OptionalAttr:$name);
+  let results = (outs OpenMP_PointerLikeType:$omp_ptr);
+
+  let description = [{
+    The MapInfoOp captures information relating to individual OpenMP map clauses
+    that are applied to certain OpenMP directives such as Target and Target Data.
+
+    For example: the map type modifier (such as from, tofrom and to), the
+    variable being captured, or the bounds of an array section being mapped.
+
+    It can be used to capture both explicit and implicit map information:
+    explicit, where an argument is directly specified in an OpenMP map clause,
+    or implicit, where a variable is utilised in a target region but is defined
+    externally to the target region.
+
+    This map information is later used to aid the lowering of the target
+    operations they are attached to, providing argument input and output
+    context for generated kernels or for the target data mapping environment.
+
+    Example (Fortran):
+
+    ```
+    integer :: index
+    !$omp target map(to: index)
+    ```
+    =>
+    ```mlir
+    omp.map_info var_ptr(%index_ssa) map_type(to) map_capture_type(ByRef) implicit(false)
+      name(index)
+    ```
+
+    Description of arguments:
+    - `var_ptr`: The address of the variable to copy.
+    - `var_ptr_ptr`: Used when the variable copied is a member of a class, structure
+      or derived type; refers to the originating struct.
+    - `bounds`: Used when copying slices of arrays, pointers or pointer members of
+      objects (e.g. derived types or classes); indicates the bounds of the variable
+      to be copied. When it is an array slice, it is in rank order, where rank 0
+      is the innermost dimension.
+    - `implicit`: Indicates whether the map item has been specified explicitly in a
+      map clause or captured implicitly by being used in a target region with no
+      map or other data mapping construct.
+    - `map_clauses`: OpenMP map type for this map capture, for example: from, to and
+      always. It is a bitfield composed of the OpenMP runtime flags stored in
+      OpenMPOffloadMappingFlags.
+    - `map_capture_type`: Capture type for the variable, e.g. this, byref, byvalue or
+      byvla; this can affect how the variable is lowered.
+    - `name`: Holds the name of the variable as specified in the user clause
+      (including bounds).
+  }];
+
+  let assemblyFormat = [{
+    `var_ptr` `(` $var_ptr `:` type($var_ptr) `)`
+    oilist(
+      `var_ptr_ptr` `(` $var_ptr_ptr `:` type($var_ptr_ptr) `)`
+    | `map_clauses` `(` custom($map_type) `)`
+    | `capture` `(` custom($map_capture_type) `)`
+    | `bounds` `(` $bounds `)`
+    ) `->` type($omp_ptr) attr-dict
+  }];
+
+  let extraClassDeclaration = [{
+    /// The number of variable operands.
+    unsigned getNumVariableOperands() {
+      return getNumOperands();
+    }
+
+    /// The i-th variable operand passed.
+    Value getVariableOperand(unsigned i) {
+      return getOperands()[i];
+    }
+  }];
+}
+
 //===---------------------------------------------------------------------===//
 // 2.14.2 target data Construct
 //===---------------------------------------------------------------------===//
@@ -1044,16 +1263,14 @@ def Target_DataOp: OpenMP_Op<"target_data", [AttrSizedOperandSegments]>{
                        Optional:$device,
                        Variadic:$use_device_ptr,
                        Variadic:$use_device_addr,
-                       Variadic:$map_operands,
-                       OptionalAttr:$map_types);
+                       Variadic:$map_operands);

   let regions = (region AnyRegion:$region);

   let assemblyFormat = [{
     oilist(`if` `(` $if_expr `:` type($if_expr) `)`
     | `device` `(` $device `:` type($device) `)`
-    | `map`
-      `(` custom($map_operands, type($map_operands), $map_types) `)`
+    | `map_entries` `(` $map_operands `:` type($map_operands) `)`
     | `use_device_ptr` `(` $use_device_ptr `:` type($use_device_ptr) `)`
     | `use_device_addr` `(` $use_device_addr `:` type($use_device_addr) `)`)
     $region attr-dict
@@ -1095,15 +1312,14 @@ def Target_EnterDataOp: OpenMP_Op<"target_enter_data",
   let arguments = (ins Optional:$if_expr,
                        Optional:$device,
                        UnitAttr:$nowait,
-                       Variadic:$map_operands,
-                       I64ArrayAttr:$map_types);
+                       Variadic:$map_operands);

   let assemblyFormat = [{
     oilist(`if` `(` $if_expr `:` type($if_expr) `)`
     | `device` `(` $device `:` type($device) `)`
-    | `nowait` $nowait)
-    `map` `(` custom($map_operands, type($map_operands), $map_types) `)`
-    attr-dict
+    | `nowait` $nowait
+    | `map_entries` `(` $map_operands `:` type($map_operands) `)`
+    ) attr-dict
   }];

   let hasVerifier = 1;
@@ -1142,15 +1358,14 @@ def Target_ExitDataOp: OpenMP_Op<"target_exit_data",
   let arguments = (ins Optional:$if_expr,
                        Optional:$device,
                        UnitAttr:$nowait,
-                       Variadic:$map_operands,
-                       I64ArrayAttr:$map_types);
+                       Variadic:$map_operands);

   let assemblyFormat = [{
     oilist(`if` `(` $if_expr `:` type($if_expr) `)`
     | `device` `(` $device `:` type($device) `)`
-    | `nowait` $nowait)
-    `map` `(` custom($map_operands, type($map_operands), $map_types) `)`
-    attr-dict
+    | `nowait` $nowait
+    | `map_entries` `(` $map_operands `:` type($map_operands) `)`
+    ) attr-dict
   }];

   let hasVerifier = 1;
@@ -1186,8 +1401,7 @@ def TargetOp : OpenMP_Op<"target",[AttrSizedOperandSegments]> {
                        Optional:$device,
                        Optional:$thread_limit,
                        UnitAttr:$nowait,
-                       Variadic:$map_operands,
-                       OptionalAttr:$map_types);
+
Variadic:$map_operands); let regions = (region AnyRegion:$region); @@ -1196,7 +1410,7 @@ def TargetOp : OpenMP_Op<"target",[AttrSizedOperandSegments]> { | `device` `(` $device `:` type($device) `)` | `thread_limit` `(` $thread_limit `:` type($thread_limit) `)` | `nowait` $nowait - | `map` `(` custom($map_operands, type($map_operands), $map_types) `)` + | `map_entries` `(` $map_operands `:` type($map_operands) `)` ) $region attr-dict }]; diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td index 6d8aaf64e3263..08b71e20a2bc0 100644 --- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td @@ -250,17 +250,10 @@ def ForOp : SCF_Op<"for", "expected an index less than the number of region iter args"); return getBody()->getArguments().drop_front(getNumInductionVars())[index]; } - MutableArrayRef getIterOpOperands() { - return - getOperation()->getOpOperands().drop_front(getNumControlOperands()); - } void setLowerBound(Value bound) { getOperation()->setOperand(0, bound); } void setUpperBound(Value bound) { getOperation()->setOperand(1, bound); } void setStep(Value step) { getOperation()->setOperand(2, step); } - void setIterArg(unsigned iterArgNum, Value iterArgValue) { - getOperation()->setOperand(iterArgNum + getNumControlOperands(), iterArgValue); - } /// Number of induction variables, always 1 for scf::ForOp. unsigned getNumInductionVars() { return 1; } @@ -270,16 +263,6 @@ def ForOp : SCF_Op<"for", } /// Number of operands controlling the loop: lb, ub, step unsigned getNumControlOperands() { return 3; } - /// Get the iter arg number for an operand. If it isnt an iter arg - /// operand return std::nullopt. - std::optional getIterArgNumberForOpOperand(OpOperand &opOperand) { - if (opOperand.getOwner() != getOperation()) - return std::nullopt; - unsigned operandNumber = opOperand.getOperandNumber(); - if (operandNumber < getNumControlOperands()) - return std::nullopt; - return operandNumber - getNumControlOperands(); - } /// Get the region iter arg that corresponds to an OpOperand. /// This helper prevents internal op implementation detail leakage to @@ -965,6 +948,7 @@ def ReduceReturnOp : def WhileOp : SCF_Op<"while", [DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, RecursiveMemoryEffects, SingleBlock]> { let summary = "a generic 'while' loop"; let description = [{ diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h index e7bcd062d9652..ca641c596c7b7 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h @@ -26,7 +26,7 @@ namespace mlir { namespace scf { using SCFTileSizeComputationFunction = - std::function(OpBuilder &, Operation *)>; + std::function(OpBuilder &, Operation *)>; /// Options to use to control tiling. struct SCFTilingOptions { @@ -40,17 +40,10 @@ struct SCFTilingOptions { tileSizeComputationFunction = std::move(fun); return *this; } - /// Set the `tileSizeComputationFunction` to return the values `ts`. The - /// values must not fold away when tiling. Otherwise, use a more robust - /// `tileSizeComputationFunction`. - SCFTilingOptions &setTileSizes(const SmallVector &ts) { - tileSizeComputationFunction = [=](OpBuilder &, Operation *) { return ts; }; - return *this; - } /// Convenience function to set the `tileSizeComputationFunction` to a /// function that computes tile sizes at the point they are needed. 
Allows /// proper interaction with folding. - SCFTilingOptions &setTileSizes(ArrayRef ts); + SCFTilingOptions &setTileSizes(ArrayRef ts); /// The interchange vector to reorder the tiled loops. SmallVector interchangeVector = {}; diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h index 306b3789a044f..82d0e93a8ee2f 100644 --- a/mlir/include/mlir/IR/OpDefinition.h +++ b/mlir/include/mlir/IR/OpDefinition.h @@ -932,10 +932,6 @@ struct SingleBlock : public TraitBase { } template enable_if_single_region insert(Block::iterator insertPt, Operation *op) { - Block *body = getBody(); - // Insert op before the block's terminator if it has one - if (insertPt == body->end() && body->hasTerminator()) - insertPt = Block::iterator(body->getTerminator()); getBody()->getOperations().insert(insertPt, op); } }; diff --git a/mlir/include/mlir/IR/ValueRange.h b/mlir/include/mlir/IR/ValueRange.h index f1a1f1841f179..9c11178f9cd9c 100644 --- a/mlir/include/mlir/IR/ValueRange.h +++ b/mlir/include/mlir/IR/ValueRange.h @@ -165,13 +165,9 @@ class MutableOperandRange { /// Returns the OpOperand at the given index. OpOperand &operator[](unsigned index) const; - OperandRange::iterator begin() const { - return static_cast(*this).begin(); - } - - OperandRange::iterator end() const { - return static_cast(*this).end(); - } + /// Iterators enumerate OpOperands. + MutableArrayRef::iterator begin() const; + MutableArrayRef::iterator end() const; private: /// Update the length of this range to the one provided. diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h index 8a45da7d1b982..5489a13a8040b 100644 --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -26,6 +26,7 @@ #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/Transforms/Passes.h" #include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/MLProgram/Transforms/Passes.h" #include "mlir/Dialect/Math/Transforms/Passes.h" #include "mlir/Dialect/MemRef/Transforms/Passes.h" #include "mlir/Dialect/NVGPU/Transforms/Passes.h" @@ -72,6 +73,7 @@ inline void registerAllPasses() { LLVM::registerLLVMPasses(); math::registerMathPasses(); memref::registerMemRefPasses(); + ml_program::registerMLProgramPasses(); registerSCFPasses(); registerShapePasses(); spirv::registerSPIRVPasses(); diff --git a/mlir/include/mlir/Interfaces/LoopLikeInterface.td b/mlir/include/mlir/Interfaces/LoopLikeInterface.td index 9ccc97251e7e6..44d32dd609fc9 100644 --- a/mlir/include/mlir/Interfaces/LoopLikeInterface.td +++ b/mlir/include/mlir/Interfaces/LoopLikeInterface.td @@ -36,15 +36,15 @@ def LoopLikeOpInterface : OpInterface<"LoopLikeOpInterface"> { /*args=*/(ins "::mlir::Value ":$value), /*methodBody=*/"", /*defaultImplementation=*/[{ - return value.getParentRegion()->isProperAncestor(&$_op.getLoopBody()); + return !$_op->isAncestor(value.getParentRegion()->getParentOp()); }] >, InterfaceMethod<[{ - Returns the region that makes up the body of the loop and should be + Returns the regions that make up the body of the loop and should be inspected for loop-invariant operations. }], - /*retTy=*/"::mlir::Region &", - /*methodName=*/"getLoopBody" + /*retTy=*/"::llvm::SmallVector<::mlir::Region *>", + /*methodName=*/"getLoopRegions" >, InterfaceMethod<[{ Moves the given loop-invariant operation out of the loop. 
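The `LoopLikeOpInterface` change above replaces the single-region `getLoopBody` accessor with a `getLoopRegions` method that returns every body region, so downstream loop ops have to migrate. Below is a minimal sketch of that migration for a hypothetical single-region op; `MyLoopOp` is illustrative and not part of this patch, but the `AffineForOp` update later in this diff follows the same pattern.

```cpp
// Hypothetical downstream op `MyLoopOp` with a single body region.
//
// Old interface method (removed by this change):
//   Region &MyLoopOp::getLoopBody() { return getRegion(); }
//
// New interface method: return every region that should be inspected for
// loop-invariant operations; single-region loops return just their body.
llvm::SmallVector<mlir::Region *> MyLoopOp::getLoopRegions() {
  return {&getRegion()};
}
```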
diff --git a/mlir/include/mlir/Transforms/LoopInvariantCodeMotionUtils.h b/mlir/include/mlir/Transforms/LoopInvariantCodeMotionUtils.h index e54675967d825..c7b816eb28faf 100644 --- a/mlir/include/mlir/Transforms/LoopInvariantCodeMotionUtils.h +++ b/mlir/include/mlir/Transforms/LoopInvariantCodeMotionUtils.h @@ -11,12 +11,13 @@ #include "mlir/Support/LLVM.h" +#include "llvm/ADT/SmallVector.h" + namespace mlir { class LoopLikeOpInterface; class Operation; class Region; -class RegionRange; class Value; /// Given a list of regions, perform loop-invariant code motion. An operation is @@ -61,7 +62,7 @@ class Value; /// /// Returns the number of operations moved. size_t moveLoopInvariantCode( - RegionRange regions, + ArrayRef regions, function_ref isDefinedOutsideRegion, function_ref shouldMoveOutOfRegion, function_ref moveOutOfRegion); diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index b06937bc285e2..504ed8f3eadb0 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -2768,6 +2768,24 @@ void mlir::python::populateIRCore(py::module &m) { return PyOpAttributeMap( self.getOperation().getRef()); }) + .def_property_readonly( + "context", + [](PyOperationBase &self) { + PyOperation &concreteOperation = self.getOperation(); + concreteOperation.checkValid(); + return concreteOperation.getContext().getObject(); + }, + "Context that owns the Operation") + .def_property_readonly("name", + [](PyOperationBase &self) { + auto &concreteOperation = self.getOperation(); + concreteOperation.checkValid(); + MlirOperation operation = + concreteOperation.get(); + MlirStringRef name = mlirIdentifierStr( + mlirOperationGetName(operation)); + return py::str(name.data, name.length); + }) .def_property_readonly("operands", [](PyOperationBase &self) { return PyOpOperandList( @@ -2813,6 +2831,14 @@ void mlir::python::populateIRCore(py::module &m) { }, "Returns the source location the operation was defined or derived " "from.") + .def_property_readonly("parent", + [](PyOperationBase &self) -> py::object { + auto parent = + self.getOperation().getParentOperation(); + if (parent) + return parent->getObject(); + return py::none(); + }) .def( "__str__", [](PyOperationBase &self) { @@ -2855,6 +2881,12 @@ void mlir::python::populateIRCore(py::module &m) { .def("move_before", &PyOperationBase::moveBefore, py::arg("other"), "Puts self immediately before the other operation in its parent " "block.") + .def( + "clone", + [](PyOperationBase &self, py::object ip) { + return self.getOperation().clone(ip); + }, + py::arg("ip") = py::none()) .def( "detach_from_parent", [](PyOperationBase &self) { @@ -2866,7 +2898,8 @@ void mlir::python::populateIRCore(py::module &m) { operation.detachFromParent(); return operation.createOpView(); }, - "Detaches the operation from its parent block."); + "Detaches the operation from its parent block.") + .def("erase", [](PyOperationBase &self) { self.getOperation().erase(); }); py::class_(m, "Operation", py::module_local()) .def_static("create", &PyOperation::create, py::arg("name"), @@ -2887,45 +2920,17 @@ void mlir::python::populateIRCore(py::module &m) { py::arg("context") = py::none(), "Parses an operation. 
Supports both text assembly format and binary " "bytecode format.") - .def_property_readonly("parent", - [](PyOperation &self) -> py::object { - auto parent = self.getParentOperation(); - if (parent) - return parent->getObject(); - return py::none(); - }) - .def("erase", &PyOperation::erase) - .def("clone", &PyOperation::clone, py::arg("ip") = py::none()) .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, &PyOperation::getCapsule) .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyOperation::createFromCapsule) - .def_property_readonly("name", - [](PyOperation &self) { - self.checkValid(); - MlirOperation operation = self.get(); - MlirStringRef name = mlirIdentifierStr( - mlirOperationGetName(operation)); - return py::str(name.data, name.length); - }) - .def_property_readonly( - "context", - [](PyOperation &self) { - self.checkValid(); - return self.getContext().getObject(); - }, - "Context that owns the Operation") + .def_property_readonly("operation", [](py::object self) { return self; }) .def_property_readonly("opview", &PyOperation::createOpView); auto opViewClass = py::class_(m, "OpView", py::module_local()) .def(py::init(), py::arg("operation")) .def_property_readonly("operation", &PyOpView::getOperationObject) - .def_property_readonly( - "context", - [](PyOpView &self) { - return self.getOperation().getContext().getObject(); - }, - "Context that owns the Operation") + .def_property_readonly("opview", [](py::object self) { return self; }) .def("__str__", [](PyOpView &self) { return py::str(self.getOperationObject()); }); @@ -3430,9 +3435,11 @@ void mlir::python::populateIRCore(py::module &m) { MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate(); if (useLocalScope) mlirOpPrintingFlagsUseLocalScope(flags); - mlirValuePrintAsOperand(self.get(), flags, printAccum.getCallback(), + MlirAsmState state = mlirAsmStateCreateForValue(self.get(), flags); + mlirValuePrintAsOperand(self.get(), state, printAccum.getCallback(), printAccum.getUserData()); mlirOpPrintingFlagsDestroy(flags); + mlirAsmStateDestroy(state); return printAccum.join(); }, py::arg("use_local_scope") = false, kGetNameAsOperand) diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index ef234a912490e..7f5c2aaee6738 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -138,6 +138,51 @@ void mlirDialectRegistryDestroy(MlirDialectRegistry registry) { delete unwrap(registry); } +//===----------------------------------------------------------------------===// +// AsmState API. +//===----------------------------------------------------------------------===// + +MlirAsmState mlirAsmStateCreateForOperation(MlirOperation op, + MlirOpPrintingFlags flags) { + return wrap(new AsmState(unwrap(op), *unwrap(flags))); +} + +static Operation *findParent(Operation *op, bool shouldUseLocalScope) { + do { + // If we are printing local scope, stop at the first operation that is + // isolated from above. + if (shouldUseLocalScope && op->hasTrait()) + break; + + // Otherwise, traverse up to the next parent. 
+    Operation *parentOp = op->getParentOp();
+    if (!parentOp)
+      break;
+    op = parentOp;
+  } while (true);
+  return op;
+}
+
+MlirAsmState mlirAsmStateCreateForValue(MlirValue value,
+                                        MlirOpPrintingFlags flags) {
+  Operation *op;
+  mlir::Value val = unwrap(value);
+  if (auto result = llvm::dyn_cast(val)) {
+    op = result.getOwner();
+  } else {
+    op = llvm::cast(val).getOwner()->getParentOp();
+    if (!op) {
+      emitError(val.getLoc()) << "<>";
+      return {nullptr};
+    }
+  }
+  op = findParent(op, unwrap(flags)->shouldUseLocalScope());
+  return wrap(new AsmState(op, *unwrap(flags)));
+}
+
+/// Destroys an AsmState created with one of the mlirAsmStateCreateFor*
+/// functions.
+void mlirAsmStateDestroy(MlirAsmState state) { delete unwrap(state); }
+
 //===----------------------------------------------------------------------===//
 // Printing flags API.
 //===----------------------------------------------------------------------===//
@@ -840,11 +885,11 @@ void mlirValuePrint(MlirValue value, MlirStringCallback callback,
   unwrap(value).print(stream);
 }

-void mlirValuePrintAsOperand(MlirValue value, MlirOpPrintingFlags flags,
+void mlirValuePrintAsOperand(MlirValue value, MlirAsmState state,
                              MlirStringCallback callback, void *userData) {
   detail::CallbackOstream stream(callback, userData);
   Value cppValue = unwrap(value);
-  cppValue.printAsOperand(stream, *unwrap(flags));
+  cppValue.printAsOperand(stream, *unwrap(state));
 }

 MlirOpOperand mlirValueGetFirstUse(MlirValue value) {
diff --git a/mlir/lib/Conversion/GPUToSPIRV/WmmaOpsToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/WmmaOpsToSPIRV.cpp
index d73cd5686d66e..eb7fcb63d920d 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/WmmaOpsToSPIRV.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/WmmaOpsToSPIRV.cpp
@@ -24,11 +24,17 @@
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "mlir/IR/ValueRange.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include

 namespace mlir {
+//===----------------------------------------------------------------------===//
+// Patterns and helpers used by both the KHR and the NV lowering paths.
+//===----------------------------------------------------------------------===//
+
 /// Creates a SPIR-V op to replace the given GPU subgroup mma elementwise op
 /// when the elementwise op directly supports with cooperative matrix type.
 /// Returns false if cannot.
@@ -77,6 +83,119 @@ static bool createElementwiseOp(ConversionPatternRewriter &builder,
   return false;
 }

+bool allOperandsHaveSameCoopMatrixType(ValueRange operands) {
+  assert(!operands.empty());
+  if (!llvm::all_equal(
+          llvm::map_range(operands, [](Value v) { return v.getType(); })))
+    return false;
+
+  return isa(
+      operands.front().getType());
+}
+
+namespace {
+/// Converts GPU MMA ConstantMatrixOp to constant SPIR-V KHR/NV cooperative
+/// matrix ops.
+struct WmmaConstantOpToSPIRVLowering final
+    : OpConversionPattern {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupMmaConstantMatrixOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    assert(adaptor.getOperands().size() == 1);
+    Value cst = adaptor.getOperands().front();
+    auto coopType = getTypeConverter()->convertType(op.getType());
+    if (!coopType)
+      return rewriter.notifyMatchFailure(op, "type conversion failed");
+
+    rewriter.replaceOpWithNewOp(op, coopType, cst);
+    return success();
+  }
+};
+
+/// Converts elementwise ops to SPIR-V cooperative matrix elementwise ops for
+/// the default case.
+struct WmmaElementwiseOpToSPIRVDefaultLowering final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(gpu::SubgroupMmaElementwiseOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // All operands should be of cooperative matrix types. + if (!allOperandsHaveSameCoopMatrixType(adaptor.getOperands())) { + return rewriter.notifyMatchFailure(op, + "not all operands are coop matrices"); + } + + auto coopType = getTypeConverter()->convertType(op.getType()); + if (!coopType) + return rewriter.notifyMatchFailure(op, "type conversion failed"); + + return success( + createElementwiseOp(rewriter, op, coopType, adaptor.getOperands())); + } +}; + +/// Converts elementwise ops to SPIR-V cooperative matrix elementwise ops for +/// matrix times scalar case. +struct WmmaElementwiseOpToSPIRVScalarMulLowering final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(gpu::SubgroupMmaElementwiseOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (adaptor.getOperands().size() != 2) + return failure(); + + // All operands should be of cooperative matrix types. + if (!allOperandsHaveSameCoopMatrixType(adaptor.getOperands())) { + return rewriter.notifyMatchFailure(op, + "not all operands are coop matrices"); + } + + if (op.getOpType() != gpu::MMAElementwiseOp::MULF) + return failure(); + + // Use the original operands to check whether one of the operands is a splat + // scalar value. + Value lhs = op.getOperands().front(); + Value rhs = op.getOperands().back(); + Value splat = nullptr; + Value matrix = nullptr; + if (lhs.getDefiningOp()) { + splat = adaptor.getOperands().front(); + matrix = adaptor.getOperands().back(); + } else if (rhs.getDefiningOp()) { + matrix = adaptor.getOperands().front(); + splat = adaptor.getOperands().back(); + } + if (!splat || !matrix) + return rewriter.notifyMatchFailure(op, "no splat operand"); + + // Constant MMA matrix ops are converted to `spirv.CompositeConstruct` ops. + Value scalar; + auto cc = splat.getDefiningOp(); + if (!cc) { + return rewriter.notifyMatchFailure(op, + "splat is not a composite construct"); + } + + assert(cc.getConstituents().size() == 1); + scalar = cc.getConstituents().front(); + + auto coopType = getTypeConverter()->convertType(op.getType()); + if (!coopType) + return rewriter.notifyMatchFailure(op, "type conversion failed"); + rewriter.replaceOpWithNewOp( + op, coopType, ValueRange{matrix, scalar}); + return success(); + } +}; +} // namespace + //===----------------------------------------------------------------------===// // SPV_KHR_cooperative_matrix //===----------------------------------------------------------------------===// @@ -262,100 +381,6 @@ struct WmmaMmaOpToSPIRVLowering final } }; -/// Converts GPU MMA ConstantMatrixOp to constant SPIR-V NV cooperative matrix -/// ops. 
-struct WmmaConstantOpToSPIRVLowering final - : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(gpu::SubgroupMmaConstantMatrixOp subgroupMmaConstantMatrixOp, - OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Value cst = adaptor.getOperands()[0]; - auto coopType = convertMMAToSPIRVCoopMatrixNVType( - cast(subgroupMmaConstantMatrixOp.getType())); - rewriter.replaceOpWithNewOp( - subgroupMmaConstantMatrixOp, coopType, cst); - return success(); - } -}; - -/// Converts elementwise ops to SPIR-V cooperative matrix elementwise ops for -/// the default case. -struct WmmaElementwiseOpToSPIRVDefaultLowering final - : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(gpu::SubgroupMmaElementwiseOp elementwiseOp, - OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - // All operands should be of cooperative matrix types. - for (Value operand : adaptor.getOperands()) { - if (!isa(operand.getType())) - return failure(); - } - auto coopType = convertMMAToSPIRVCoopMatrixNVType( - cast(elementwiseOp.getType())); - return success(createElementwiseOp(rewriter, elementwiseOp, coopType, - adaptor.getOperands())); - } -}; - -/// Converts elementwise ops to SPIR-V cooperative matrix elementwise ops for -/// matrix times scalar case. -struct WmmaElementwiseOpToSPIRVScalarMulLowering final - : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(gpu::SubgroupMmaElementwiseOp elementwiseOp, - OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (adaptor.getOperands().size() != 2) - return failure(); - // All operands should be of cooperative matrix types. - for (Value operand : adaptor.getOperands()) { - if (!isa(operand.getType())) - return failure(); - } - - if (elementwiseOp.getOpType() != gpu::MMAElementwiseOp::MULF) - return failure(); - - // Use the original operands to check whether one of the operands is a splat - // scalar value. - Value lhs = elementwiseOp.getOperands().front(); - Value rhs = elementwiseOp.getOperands().back(); - Value splat = nullptr; - Value matrix = nullptr; - if (lhs.getDefiningOp()) { - splat = adaptor.getOperands().front(); - matrix = adaptor.getOperands().back(); - } else if (rhs.getDefiningOp()) { - matrix = adaptor.getOperands().front(); - splat = adaptor.getOperands().back(); - } - if (!splat || !matrix) - return failure(); - - // Constant MMA matrix ops are converted to spirv.CompositeConstruct ops. - Value scalar = nullptr; - auto cc = splat.getDefiningOp(); - if (!cc) - return failure(); - assert(cc.getConstituents().size() == 1); - scalar = cc.getConstituents().front(); - - auto coopType = convertMMAToSPIRVCoopMatrixNVType( - cast(elementwiseOp.getType())); - rewriter.replaceOpWithNewOp( - elementwiseOp, coopType, ValueRange{matrix, scalar}); - return success(); - } -}; - } // namespace } // namespace nv } // namespace mlir @@ -389,19 +414,21 @@ void mlir::populateGpuWMMAToSPIRVCoopMatrixKHRConversionPatterns( using namespace mlir; MLIRContext *context = patterns.getContext(); patterns.add(converter, context); + khr::WmmaStoreOpToSPIRVLowering, WmmaConstantOpToSPIRVLowering, + WmmaElementwiseOpToSPIRVDefaultLowering>(converter, context); + // Give the following patterns higher benefit to prevail over the default one. 
+  patterns.add(converter, context,
+               /*benefit=*/2);
 }

 void mlir::populateGpuWMMAToSPIRVCoopMatrixNVConversionPatterns(
     SPIRVTypeConverter &converter, RewritePatternSet &patterns) {
   using namespace mlir;
   MLIRContext *context = patterns.getContext();
-  patterns
-      .add(converter, context);
+  patterns.add(converter, context);
   // Give the following patterns higher benefit to prevail over the default one.
-  patterns.add(converter,
-               context,
-               /*benefit=*/2);
+  patterns.add(converter, context,
+               /*benefit=*/2);
 }
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index adcbbc3f0abb2..b018b82fa5794 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -207,10 +207,10 @@ void mlir::configureOpenMPToLLVMConversionLegality(
           typeConverter.isLegal(op->getOperandTypes()) &&
           typeConverter.isLegal(op->getResultTypes());
     });
-  target.addDynamicallyLegalOp(
+  target.addDynamicallyLegalOp<
+      mlir::omp::AtomicReadOp, mlir::omp::AtomicWriteOp, mlir::omp::FlushOp,
+      mlir::omp::ThreadprivateOp, mlir::omp::YieldOp, mlir::omp::EnterDataOp,
+      mlir::omp::ExitDataOp, mlir::omp::DataBoundsOp, mlir::omp::MapInfoOp>(
       [&](Operation *op) {
         return typeConverter.isLegal(op->getOperandTypes()) &&
                typeConverter.isLegal(op->getResultTypes());
@@ -230,6 +230,12 @@ void mlir::configureOpenMPToLLVMConversionLegality(

 void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                                   RewritePatternSet &patterns) {
+  // This type is allowed when converting OpenMP to the LLVM dialect; it
+  // carries bounds information for map clauses, and the operation and type
+  // are discarded when lowering from the OpenMP dialect to LLVM-IR.
+  converter.addConversion(
+      [&](omp::DataBoundsType type) -> Type { return type; });
+
   patterns.add<
       AtomicReadOpConversion, ReductionOpConversion,
       ReductionDeclareOpConversion, RegionOpConversion,
       RegionLessOpWithVarOperandsConversion,
       RegionLessOpConversion,
       RegionLessOpConversion,
-      RegionLessOpConversion>(converter);
+      RegionLessOpConversion,
+      RegionLessOpWithVarOperandsConversion,
+      RegionLessOpWithVarOperandsConversion>(converter);
 }

 namespace {
diff --git a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp
index 81a6378fc7e49..f6e3053b8ae6a 100644
--- a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp
+++ b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp
@@ -163,7 +163,7 @@ struct ForOpConversion final : SCFToSPIRVPattern {
     signatureConverter.remapInput(0, newIndVar);
     for (unsigned i = 1, e = body->getNumArguments(); i < e; i++)
       signatureConverter.remapInput(i, header->getArgument(i));
-    body = rewriter.applySignatureConversion(&forOp.getLoopBody(),
+    body = rewriter.applySignatureConversion(&forOp.getRegion(),
                                              signatureConverter);

     // Move the blocks from the forOp into the loopOp. This is the body of the
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index c8871c945cbe7..f0412648608a6 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -1103,7 +1103,7 @@ convertBroadcastOp(RewriterBase &rewriter, vector::BroadcastOp op,
 }

 // Replace ForOp with a new ForOp with extra operands. The YieldOp is not
-// updated and needs to be updated separatly for the loop to be correct.
+// updated and needs to be updated separately for the loop to be correct. static scf::ForOp replaceForOpWithNewSignature(RewriterBase &rewriter, scf::ForOp loop, ValueRange newInitArgs) { @@ -1119,9 +1119,8 @@ static scf::ForOp replaceForOpWithNewSignature(RewriterBase &rewriter, operands); newLoop.getBody()->erase(); - newLoop.getLoopBody().getBlocks().splice( - newLoop.getLoopBody().getBlocks().begin(), - loop.getLoopBody().getBlocks()); + newLoop.getRegion().getBlocks().splice( + newLoop.getRegion().getBlocks().begin(), loop.getRegion().getBlocks()); for (Value operand : newInitArgs) newLoop.getBody()->addArgument(operand.getType(), operand.getLoc()); diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 4455cc88e65e5..5899c198b703b 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -2380,8 +2380,7 @@ void AffineForOp::getCanonicalizationPatterns(RewritePatternSet &results, /// induction variable. AffineForOp only has one region, so zero is the only /// valid value for `index`. OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) { - assert((point.isParent() || point == getLoopBody()) && - "invalid region point"); + assert((point.isParent() || point == getRegion()) && "invalid region point"); // The initial operands map to the loop arguments after the induction // variable or are forwarded to the results when the trip count is zero. @@ -2395,8 +2394,7 @@ OperandRange AffineForOp::getEntrySuccessorOperands(RegionBranchPoint point) { /// not a constant. void AffineForOp::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { - assert((point.isParent() || point == getLoopBody()) && - "expected loop region"); + assert((point.isParent() || point == getRegion()) && "expected loop region"); // The loop may typically branch back to its body or to the parent operation. // If the predecessor is the parent op and the trip count is known to be at // least one, branch into the body using the iterator arguments. And in cases @@ -2404,7 +2402,7 @@ void AffineForOp::getSuccessorRegions( std::optional tripCount = getTrivialConstantTripCount(*this); if (point.isParent() && tripCount.has_value()) { if (tripCount.value() > 0) { - regions.push_back(RegionSuccessor(&getLoopBody(), getRegionIterArgs())); + regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); return; } if (tripCount.value() == 0) { @@ -2422,7 +2420,7 @@ void AffineForOp::getSuccessorRegions( // In all other cases, the loop may branch back to itself or the parent // operation. - regions.push_back(RegionSuccessor(&getLoopBody(), getRegionIterArgs())); + regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); regions.push_back(RegionSuccessor(getResults())); } @@ -2561,7 +2559,7 @@ bool AffineForOp::matchingBoundOperandList() { return true; } -Region &AffineForOp::getLoopBody() { return getRegion(); } +SmallVector AffineForOp::getLoopRegions() { return {&getRegion()}; } std::optional AffineForOp::getSingleInductionVar() { return getInductionVar(); @@ -2758,9 +2756,9 @@ AffineForOp mlir::affine::replaceForOpWithNewYields(OpBuilder &b, b.create(loop.getLoc(), lbOperands, lbMap, ubOperands, ubMap, loop.getStep(), operands); // Take the body of the original parent loop. 
- newLoop.getLoopBody().takeBody(loop.getLoopBody()); + newLoop.getRegion().takeBody(loop.getRegion()); for (Value val : newIterArgs) - newLoop.getLoopBody().addArgument(val.getType(), val.getLoc()); + newLoop.getRegion().addArgument(val.getType(), val.getLoc()); // Update yield operation with new values to be added. if (!newYieldedValues.empty()) { @@ -3848,7 +3846,9 @@ void AffineParallelOp::build(OpBuilder &builder, OperationState &result, ensureTerminator(*bodyRegion, builder, result.location); } -Region &AffineParallelOp::getLoopBody() { return getRegion(); } +SmallVector AffineParallelOp::getLoopRegions() { + return {&getRegion()}; +} unsigned AffineParallelOp::getNumDims() { return getSteps().size(); } diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp index c9b7f25c545cd..6b3776533bed4 100644 --- a/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp @@ -85,11 +85,11 @@ static bool isOpLoopInvariant(Operation &op, Value indVar, ValueRange iterArgs, opsToHoist)) return false; } else if (auto forOp = dyn_cast(op)) { - if (!areAllOpsInTheBlockListInvariant(forOp.getLoopBody(), indVar, iterArgs, + if (!areAllOpsInTheBlockListInvariant(forOp.getRegion(), indVar, iterArgs, opsWithUsers, opsToHoist)) return false; } else if (auto parOp = dyn_cast(op)) { - if (!areAllOpsInTheBlockListInvariant(parOp.getLoopBody(), indVar, iterArgs, + if (!areAllOpsInTheBlockListInvariant(parOp.getRegion(), indVar, iterArgs, opsWithUsers, opsToHoist)) return false; } else if (!isMemoryEffectFree(&op) && diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp index 7bd25dbbca636..12a28c2e23b22 100644 --- a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp +++ b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp @@ -429,7 +429,7 @@ static ParallelComputeFunction createParallelComputeFunction( mapping.map(op.getInductionVars(), computeBlockInductionVars); mapping.map(computeFuncType.captures, captures); - for (auto &bodyOp : op.getLoopBody().getOps()) + for (auto &bodyOp : op.getRegion().getOps()) b.clone(bodyOp, mapping); }; }; @@ -732,7 +732,7 @@ AsyncParallelForRewrite::matchAndRewrite(scf::ParallelOp op, // Make sure that all constants will be inside the parallel operation body to // reduce the number of parallel compute function arguments. 
- cloneConstantsIntoTheRegion(op.getLoopBody(), rewriter); + cloneConstantsIntoTheRegion(op.getRegion(), rewriter); // Compute trip count for each loop induction variable: // tripCount = ceil_div(upperBound - lowerBound, step); diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp index 43ba11cf132cb..09d3083582808 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp @@ -47,6 +47,10 @@ static Value buildBoolValue(OpBuilder &builder, Location loc, bool value) { static bool isMemref(Value v) { return v.getType().isa(); } +static bool isMemrefOperand(OpOperand &operand) { + return isMemref(operand.get()); +} + //===----------------------------------------------------------------------===// // Backedges analysis //===----------------------------------------------------------------------===// @@ -937,7 +941,7 @@ BufferDeallocation::handleInterface(RegionBranchTerminatorOpInterface op) { // Add an additional operand for every MemRef for the ownership indicator. if (!funcWithoutDynamicOwnership) { - unsigned numMemRefs = llvm::count_if(operands, isMemref); + unsigned numMemRefs = llvm::count_if(operands, isMemrefOperand); SmallVector newOperands{OperandRange(operands)}; auto ownershipValues = deallocOp.getUpdatedConditions().take_front(numMemRefs); diff --git a/mlir/lib/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.cpp b/mlir/lib/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.cpp index e847e946eef1b..9423af2542690 100644 --- a/mlir/lib/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/ControlFlow/Transforms/BufferDeallocationOpInterfaceImpl.cpp @@ -96,12 +96,12 @@ struct CondBranchOpInterface mapping[retained] = ownership; } SmallVector replacements, ownerships; - for (Value operand : destOperands) { - replacements.push_back(operand); - if (isMemref(operand)) { - assert(mapping.contains(operand) && + for (OpOperand &operand : destOperands) { + replacements.push_back(operand.get()); + if (isMemref(operand.get())) { + assert(mapping.contains(operand.get()) && "Should be contained at this point"); - ownerships.push_back(mapping[operand]); + ownerships.push_back(mapping[operand.get()]); } } replacements.append(ownerships); diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index de4270ab38004..1819ca614a060 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -473,7 +473,9 @@ transform::FuseOp::apply(transform::TransformRewriter &rewriter, scf::SCFTilingOptions tilingOptions; tilingOptions.interchangeVector = tileInterchange; - tilingOptions = tilingOptions.setTileSizes(tileSizes); + SmallVector tileSizesOfr = + getAsIndexOpFoldResult(rewriter.getContext(), tileSizes); + tilingOptions = tilingOptions.setTileSizes(tileSizesOfr); scf::SCFTileAndFuseOptions tileAndFuseOptions; tileAndFuseOptions.tilingOptions = tilingOptions; LogicalResult result = applyTilingToAll( @@ -923,7 +925,7 @@ transform::FuseIntoContainingOp::apply(transform::TransformRewriter &rewriter, auto nextProducer = getNextProducer(); if (failed(nextProducer)) { auto diag = mlir::emitSilenceableFailure(getLoc()) - << "could not find next producer to 
fuse into container"; + << "could not find next producer to fuse into container"; diag.attachNote(containingOp->getLoc()) << "containing op"; return diag; } @@ -1999,7 +2001,7 @@ transform::ScalarizeOp::applyToOne(transform::TransformRewriter &rewriter, transform::TransformState &state) { scf::SCFTilingOptions tilingOptions; tilingOptions.setTileSizeComputationFunction([&](OpBuilder &b, Operation *) { - SmallVector tileSizes; + SmallVector tileSizes; Location loc = target.getLoc(); SmallVector allShapeSizes = target.createFlatListOfOperandDims(b, loc); @@ -2012,9 +2014,8 @@ transform::ScalarizeOp::applyToOne(transform::TransformRewriter &rewriter, // If the shape size is dynamic, tile by 1. // Otherwise, do not tile (i.e. tile size 0). for (OpFoldResult shapeSize : shapeSizes) { - tileSizes.push_back(getConstantIntValue(shapeSize) - ? b.create(loc, 0) - : b.create(loc, 1)); + tileSizes.push_back(getConstantIntValue(shapeSize) ? b.getIndexAttr(0) + : b.getIndexAttr(1)); } return tileSizes; }); @@ -2549,7 +2550,7 @@ transform::TileOp::apply(transform::TransformRewriter &rewriter, if (!tileSizes.empty()) { tilingOptions.setTileSizeComputationFunction([&, index = i](OpBuilder &b, Operation *) { - SmallVector sizes; + SmallVector sizes; sizes.reserve(tileSizes.size()); unsigned dynamicIdx = 0; @@ -2560,10 +2561,10 @@ transform::TileOp::apply(transform::TransformRewriter &rewriter, getLoc(), attr.cast().getInt()); Value vscale = b.create(getLoc(), b.getIndexType()); - sizes.push_back(b.create(getLoc(), val, vscale)); + sizes.push_back( + b.create(getLoc(), val, vscale).getResult()); } else { - sizes.push_back(b.create( - getLoc(), cast(attr).getInt())); + sizes.push_back(attr); } continue; } @@ -2573,8 +2574,7 @@ transform::TileOp::apply(transform::TransformRewriter &rewriter, assert((dynamicSizes.empty() ^ params.empty()) && "expected either dynamic sizes or parameters"); if (!params.empty()) { - sizes.push_back( - b.create(getLoc(), params[index])); + sizes.push_back(b.getIndexAttr(params[index])); } else { sizes.push_back(dynamicSizes[index]->getResult(0)); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp index 21bc0554e7176..a9debb7bbc489 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -810,13 +810,9 @@ padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting, OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPointAfter(hoistedPackedTensor.getDefiningOp()); - std::optional maybeOperandNumber = - forOp.getIterArgNumberForOpOperand(*pUse); - assert(maybeOperandNumber.has_value() && "expected a proper iter arg number"); - - int64_t operandNumber = maybeOperandNumber.value(); + unsigned iterArgNumber = forOp.getResultForOpOperand(*pUse).getResultNumber(); auto yieldOp = cast(forOp.getBody(0)->getTerminator()); - auto yieldingExtractSliceOp = yieldOp->getOperand(operandNumber) + auto yieldingExtractSliceOp = yieldOp->getOperand(iterArgNumber) .getDefiningOp(); if (!yieldingExtractSliceOp) return tensor::ExtractSliceOp(); @@ -829,9 +825,9 @@ padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting, return tensor::ExtractSliceOp(); SmallVector initArgs = forOp.getInitArgs(); - initArgs[operandNumber] = hoistedPackedTensor; + initArgs[iterArgNumber] = hoistedPackedTensor; SmallVector yieldOperands = yieldOp.getOperands(); - yieldOperands[operandNumber] = yieldingExtractSliceOp.getSource(); + 
yieldOperands[iterArgNumber] = yieldingExtractSliceOp.getSource(); int64_t numOriginalForOpResults = initArgs.size(); LLVM_DEBUG(DBGS() << "numOriginalForOpResults: " << numOriginalForOpResults @@ -844,7 +840,7 @@ padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting, hoistedPackedTensor.getLoc(), hoistedPackedTensor, outerSliceOp.getMixedOffsets(), outerSliceOp.getMixedSizes(), outerSliceOp.getMixedStrides()); - rewriter.replaceAllUsesWith(forOp.getResult(operandNumber), extracted); + rewriter.replaceAllUsesWith(forOp.getResult(iterArgNumber), extracted); } scf::ForOp newForOp = replaceLoopWithNewYields(rewriter, forOp, initArgs, yieldOperands); @@ -853,20 +849,20 @@ padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting, << "\n"); LLVM_DEBUG(DBGS() << "replace source of: " << extracted << "\n"); LLVM_DEBUG(DBGS() << "with result #" - << numOriginalForOpResults + operandNumber + << numOriginalForOpResults + iterArgNumber << " of forOp, giving us: " << extracted << "\n"); rewriter.startRootUpdate(extracted); extracted.getSourceMutable().assign( - newForOp.getResult(numOriginalForOpResults + operandNumber)); + newForOp.getResult(numOriginalForOpResults + iterArgNumber)); rewriter.finalizeRootUpdate(extracted); LLVM_DEBUG(DBGS() << "replace uses of: " << paddedValueBeforeHoisting << "\n"); LLVM_DEBUG(DBGS() << "with region iter arg #" - << numOriginalForOpResults + operandNumber << "\n"); + << numOriginalForOpResults + iterArgNumber << "\n"); rewriter.replaceAllUsesWith( paddedValueBeforeHoisting, - newForOp.getRegionIterArg(numOriginalForOpResults + operandNumber)); + newForOp.getRegionIterArg(numOriginalForOpResults + iterArgNumber)); return extracted; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp index 2ee21099cfb14..7c6639304d97c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp @@ -219,8 +219,7 @@ void mlir::linalg::hoistRedundantVectorTransfers(func::FuncOp func) { // Replace all uses of the `transferRead` with the corresponding // basic block argument. transferRead.getVector().replaceUsesWithIf( - newForOp.getLoopBody().getArguments().back(), - [&](OpOperand &use) { + newForOp.getBody()->getArguments().back(), [&](OpOperand &use) { Operation *user = use.getOwner(); return newForOp->isProperAncestor(user); }); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp index 5a44a85c95c75..72b684aaa864c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp @@ -199,9 +199,9 @@ static void replaceIndexOpsByInductionVariables(RewriterBase &rewriter, // Replace the index operations in the body of the innermost loop op. 
if (!loopOps.empty()) { auto loopOp = cast(loopOps.back()); - for (IndexOp indexOp : - llvm::make_early_inc_range(loopOp.getLoopBody().getOps())) - rewriter.replaceOp(indexOp, allIvs[indexOp.getDim()]); + for (Region *r : loopOp.getLoopRegions()) + for (IndexOp indexOp : llvm::make_early_inc_range(r->getOps())) + rewriter.replaceOp(indexOp, allIvs[indexOp.getDim()]); } } diff --git a/mlir/lib/Dialect/Linalg/Transforms/SubsetHoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/SubsetHoisting.cpp index f4556787668d4..7ab4ea41a2cd8 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/SubsetHoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/SubsetHoisting.cpp @@ -303,7 +303,7 @@ static Operation *isTensorChunkAccessedByUnknownOp(Operation *writeOp, // pass-through tensor arguments left from previous level of // hoisting. if (auto forUser = dyn_cast(user)) { - Value arg = forUser.getLoopBody().getArgument( + Value arg = forUser.getBody()->getArgument( use.getOperandNumber() - forUser.getNumControlOperands() + /*iv value*/ 1); uses.push_back(arg.getUses()); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index f873bd0e0b68e..51a83c35e4cda 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -924,7 +924,6 @@ getTensorExtractMemoryAccessPattern(tensor::ExtractOp extractOp, targetShape.back() == 1) return VectorMemoryAccessKind::Gather; - // 2. Assume that it's a gather load when reading _from_ a tensor for which // the trailing dimension is 1, e.g. `tensor<1x4x1xi32>`. // TODO: Relax this condition. @@ -1484,6 +1483,10 @@ static LogicalResult vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op) { static LogicalResult isValidMaskedInputVector(ArrayRef shape, ArrayRef inputVectorSizes) { + LDBG("Iteration space static sizes:"); + LLVM_DEBUG(llvm::interleaveComma(shape, llvm::dbgs())); + LLVM_DEBUG(llvm::dbgs() << "\n"); + if (inputVectorSizes.size() != shape.size()) { LDBG("Input vector sizes don't match the number of loops"); return failure(); diff --git a/mlir/lib/Dialect/MLProgram/CMakeLists.txt b/mlir/lib/Dialect/MLProgram/CMakeLists.txt index f33061b2d87cf..9f57627c321fb 100644 --- a/mlir/lib/Dialect/MLProgram/CMakeLists.txt +++ b/mlir/lib/Dialect/MLProgram/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(IR) +add_subdirectory(Transforms) diff --git a/mlir/lib/Dialect/MLProgram/IR/MLProgramOps.cpp b/mlir/lib/Dialect/MLProgram/IR/MLProgramOps.cpp index f8f7549566039..5352b7b0454fd 100644 --- a/mlir/lib/Dialect/MLProgram/IR/MLProgramOps.cpp +++ b/mlir/lib/Dialect/MLProgram/IR/MLProgramOps.cpp @@ -178,8 +178,14 @@ LogicalResult GlobalOp::verify() { //===----------------------------------------------------------------------===// GlobalOp GlobalLoadOp::getGlobalOp(SymbolTableCollection &symbolTable) { - return symbolTable.lookupNearestSymbolFrom( - getOperation()->getParentOp(), getGlobalAttr()); + for (auto parent = getOperation()->getParentOp(); parent; + parent = parent->getParentOp()) { + if (auto nearest = symbolTable.lookupNearestSymbolFrom( + parent, getGlobalAttr())) { + return nearest; + } + } + return {}; } LogicalResult @@ -253,8 +259,14 @@ GlobalLoadGraphOp::verifySymbolUses(SymbolTableCollection &symbolTable) { //===----------------------------------------------------------------------===// GlobalOp GlobalStoreOp::getGlobalOp(SymbolTableCollection &symbolTable) { - return symbolTable.lookupNearestSymbolFrom( - getOperation()->getParentOp(), 
getGlobalAttr()); + for (auto parent = getOperation()->getParentOp(); parent;) { + if (auto nearest = symbolTable.lookupNearestSymbolFrom( + parent, getGlobalAttr())) { + return nearest; + } + parent = parent->getParentOp(); + } + return {}; } LogicalResult diff --git a/mlir/lib/Dialect/MLProgram/Transforms/CMakeLists.txt b/mlir/lib/Dialect/MLProgram/Transforms/CMakeLists.txt new file mode 100644 index 0000000000000..db567b62e0e74 --- /dev/null +++ b/mlir/lib/Dialect/MLProgram/Transforms/CMakeLists.txt @@ -0,0 +1,14 @@ +add_mlir_dialect_library(MLIRMLProgramTransforms + PipelineGlobalOps.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/MLProgram/Transforms + + DEPENDS + MLIRMLProgramPassIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRMLProgramDialect + MLIRPass +) diff --git a/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp new file mode 100644 index 0000000000000..7e00a731f6d73 --- /dev/null +++ b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp @@ -0,0 +1,234 @@ +//===- PipelineGlobalOpsPass.cpp - Pipeline Global Ops Pass ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/MLProgram/Transforms/Passes.h" + +#include "mlir/Dialect/MLProgram/IR/MLProgram.h" +#include "mlir/Dialect/MLProgram/Transforms/Passes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir { +namespace ml_program { +#define GEN_PASS_DEF_MLPROGRAMPIPELINEGLOBALS +#include "mlir/Dialect/MLProgram/Transforms/Passes.h.inc" + +namespace { + +class MLProgramPipelineGlobals + : public impl::MLProgramPipelineGlobalsBase { +public: + void runOnOperation() override; + +private: + LogicalResult buildGlobalMap(ModuleOp op); + + void ProcessBlock(Block &block, llvm::DenseSet &symbolLoad, + llvm::DenseSet &symbolStore); + + llvm::DenseMap> loadSymbolsMap; + llvm::DenseMap> storeSymbolsMap; +}; + +// Traverses upwards searching for the operation mapped by the symbol. +static Operation *getFromSymbol(Operation *baseOp, SymbolRefAttr symbol) { + for (auto op = baseOp; op; op = op->getParentOp()) { + auto lookup = SymbolTable::lookupNearestSymbolFrom(op, symbol); + if (lookup) + return lookup; + } + return nullptr; +} + +// Builds a map from a symbol to the MLProgram global symbols loaded or stored +// during processing. +LogicalResult MLProgramPipelineGlobals::buildGlobalMap(ModuleOp module) { + llvm::DenseMap callableMap; + auto res = module->walk([&](Operation *op) { + if (auto caller = mlir::dyn_cast(op)) { + auto callable = caller.getCallableForCallee(); + // For now we do not know how to handle Value based tracing, so fail. + if (mlir::isa(callable)) { + return WalkResult::interrupt(); + } + + auto symbol = mlir::dyn_cast(callable); + auto func = getFromSymbol(op, symbol); + callableMap[symbol] = func; + } + return WalkResult::advance(); + }); + + if (res.wasInterrupted()) { + return failure(); + } + + // First grab all symbols loaded or stored by each function. This + // will not handle calls initially.
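+ // (A second pass below folds calls in by propagating these per-function + // sets transitively through the call graph with a simple worklist.)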
+ llvm::DenseMap> opLoadSymbols; + llvm::DenseMap> opStoreSymbols; + for (auto callable : callableMap) { + llvm::DenseSet loadSymbols; + llvm::DenseSet storeSymbols; + + callable.getSecond()->walk( + [&](GlobalLoadOp op) { loadSymbols.insert(op.getGlobal()); }); + + callable.getSecond()->walk( + [&](GlobalStoreOp op) { storeSymbols.insert(op.getGlobal()); }); + + opLoadSymbols[callable.getFirst()] = std::move(loadSymbols); + opStoreSymbols[callable.getFirst()] = std::move(storeSymbols); + } + + // For each callable function we find each global loaded/stored within the + // function or a nested called function. This includes recursion checking to + // avoid infinitely recursing. + for (auto callable : callableMap) { + SymbolRefAttr thisSymbol = llvm::dyn_cast(callable.first); + llvm::SmallVector work = {thisSymbol}; + llvm::DenseSet visited = {thisSymbol}; + llvm::DenseSet loadSymbols; + llvm::DenseSet storeSymbols; + + for (size_t i = 0; i < work.size(); ++i) { + callableMap[work[i]]->walk([&](CallOpInterface call) { + auto symbol = dyn_cast(call.getCallableForCallee()); + if (!visited.contains(symbol)) { + visited.insert(symbol); + work.push_back(symbol); + } + }); + + for (auto load : opLoadSymbols[work[i]]) + loadSymbols.insert(load); + + for (auto store : opStoreSymbols[work[i]]) + storeSymbols.insert(store); + } + + loadSymbolsMap[thisSymbol] = std::move(loadSymbols); + storeSymbolsMap[thisSymbol] = std::move(storeSymbols); + } + + return success(); +} + +// Process each operation in the block deleting unneeded loads / stores, +// recursing on subblocks and checking function calls. +void MLProgramPipelineGlobals::ProcessBlock( + Block &block, llvm::DenseSet &symbolLoad, + llvm::DenseSet &symbolStore) { + + llvm::DenseMap previousLoads; + llvm::DenseMap previousStores; + llvm::SmallVector toDelete; + for (auto &op : block) { + // If this is a global load, remap to a previous value if known + // and delete this load. Remember that this value is the currently + // known load. + if (auto load = mlir::dyn_cast(op)) { + auto ref = load.getGlobal(); + symbolLoad.insert(ref); + if (previousLoads.contains(ref)) { + toDelete.push_back(&op); + load.getResult().replaceAllUsesWith(previousLoads[ref]); + } else { + previousLoads[ref] = load.getResult(); + } + continue; + } + + // Delete a previous store if it exists and is not needed, update + // the most recent known value for this global ref. + if (auto store = mlir::dyn_cast(op)) { + auto ref = store.getGlobal(); + symbolStore.insert(ref); + if (previousStores.contains(ref)) { + toDelete.push_back(previousStores.find(ref)->getSecond()); + } + + previousLoads[ref] = store.getValue(); + previousStores[ref] = &op; + continue; + } + + // If a function is called, clear known values for loads/stores used by + // the function or its sub-functions. + if (auto call = mlir::dyn_cast(op)) { + auto loadSymbols = + loadSymbolsMap[dyn_cast(call.getCallableForCallee())]; + auto storeSymbols = + storeSymbolsMap[dyn_cast(call.getCallableForCallee())]; + + for (auto sym : loadSymbols) { + previousStores.erase(sym); + } + + for (auto sym : storeSymbols) { + previousLoads.erase(sym); + previousStores.erase(sym); + } + continue; + } + + // If the op has sub-regions, recurse inside. We make no guarantees whether + // the recursion occurs. 
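+ // Any symbol loaded or stored anywhere inside the nested regions + // conservatively invalidates the corresponding cached values below.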
+ llvm::DenseSet opSymbolLoad; + llvm::DenseSet opSymbolStore; + for (auto ®ion : op.getRegions()) { + for (auto &block : region) { + ProcessBlock(block, opSymbolLoad, opSymbolStore); + } + } + + // Update current state from the subblock. + for (auto change : opSymbolLoad) { + symbolLoad.insert(change); + previousStores.erase(change); + } + + for (auto change : opSymbolStore) { + symbolStore.insert(change); + previousLoads.erase(change); + previousStores.erase(change); + } + } + + for (auto op : toDelete) { + op->erase(); + } +} + +void MLProgramPipelineGlobals::runOnOperation() { + auto targetOp = getOperation(); + if (failed(buildGlobalMap(targetOp))) { + return; + } + + for (auto &funcOp : *targetOp.getBody()) { + for (auto ®ion : funcOp.getRegions()) { + for (auto &block : region.getBlocks()) { + llvm::DenseSet symbolsLoaded; + llvm::DenseSet symbolsStored; + ProcessBlock(block, symbolsLoaded, symbolsStored); + } + } + } +} + +} // namespace + +std::unique_ptr> +createMLProgramPipelineGlobalsPass() { + return std::make_unique(); +} + +} // namespace ml_program +} // namespace mlir diff --git a/mlir/lib/Dialect/MemRef/Transforms/MultiBuffer.cpp b/mlir/lib/Dialect/MemRef/Transforms/MultiBuffer.cpp index eb1df2a87b99a..397bd5856bcb0 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/MultiBuffer.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/MultiBuffer.cpp @@ -152,8 +152,9 @@ mlir::memref::multiBuffer(RewriterBase &rewriter, memref::AllocOp allocOp, std::optional inductionVar = candidateLoop.getSingleInductionVar(); std::optional lowerBound = candidateLoop.getSingleLowerBound(); std::optional singleStep = candidateLoop.getSingleStep(); - if (!inductionVar || !lowerBound || !singleStep) { - LLVM_DEBUG(DBGS() << "Skip alloc: no single iv, lb or step\n"); + if (!inductionVar || !lowerBound || !singleStep || + !llvm::hasSingleElement(candidateLoop.getLoopRegions())) { + LLVM_DEBUG(DBGS() << "Skip alloc: no single iv, lb, step or region\n"); return failure(); } @@ -184,7 +185,8 @@ mlir::memref::multiBuffer(RewriterBase &rewriter, memref::AllocOp allocOp, // 3. Within the loop, build the modular leading index (i.e. each loop // iteration %iv accesses slice ((%iv - %lb) / %step) % %mb_factor). - rewriter.setInsertionPointToStart(&candidateLoop.getLoopBody().front()); + rewriter.setInsertionPointToStart( + &candidateLoop.getLoopRegions().front()->front()); Value ivVal = *inductionVar; Value lbVal = getValueOrCreateConstantIndexOp(rewriter, loc, *lowerBound); Value stepVal = getValueOrCreateConstantIndexOp(rewriter, loc, *singleStep); diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 18da93bc1a342..2bf9355ed6267 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -66,6 +66,10 @@ void OpenMPDialect::initialize() { #define GET_ATTRDEF_LIST #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc" >(); + addTypes< +#define GET_TYPEDEF_LIST +#include "mlir/Dialect/OpenMP/OpenMPOpsTypes.cpp.inc" + >(); addInterface(); LLVM::LLVMPointerType::attachInterface< @@ -660,187 +664,196 @@ static LogicalResult verifySynchronizationHint(Operation *op, uint64_t hint) { //===----------------------------------------------------------------------===// // Parser, printer and verifier for Target //===----------------------------------------------------------------------===// -/// Parses a Map Clause. -/// -/// map-clause = `map (` ( `(` `always, `? `close, `? `present, `? 
( `to` | -/// `from` | `delete` ) ` -> ` symbol-ref ` : ` type(symbol-ref) `)` )+ `)` -/// Eg: map((release -> %1 : !llvm.ptr>), (always, close, from -/// -> %2 : !llvm.ptr>)) -static ParseResult -parseMapClause(OpAsmParser &parser, - SmallVectorImpl &map_operands, - SmallVectorImpl &map_operand_types, ArrayAttr &map_types) { - StringRef mapTypeMod; - OpAsmParser::UnresolvedOperand arg1; - Type arg1Type; - IntegerAttr arg2; - SmallVector mapTypesVec; - llvm::omp::OpenMPOffloadMappingFlags mapTypeBits; +// Helper function to get bitwise AND of `value` and `flag`. +uint64_t mapTypeToBitFlag(uint64_t value, + llvm::omp::OpenMPOffloadMappingFlags flag) { + return value & + static_cast< + std::underlying_type_t>( + flag); +} + +/// Parses a map_entries map type from a string format back into its numeric +/// value. +/// +/// map-clause = `map_clauses ( ( `(` `always, `? `close, `? `present, `? ( +/// `to` | `from` | `delete` `)` )+ `)` ) +static ParseResult parseMapClause(OpAsmParser &parser, IntegerAttr &mapType) { + llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; + + // This simply verifies that the correct keyword is read in; the + // keyword itself is stored inside of the operation. + auto parseTypeAndMod = [&]() -> ParseResult { + StringRef mapTypeMod; + if (parser.parseKeyword(&mapTypeMod)) + return failure(); + if (mapTypeMod == "always") + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS; + + if (mapTypeMod == "close") + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE; + + if (mapTypeMod == "present") + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT; if (mapTypeMod == "to") + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + + if (mapTypeMod == "from") + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + + if (mapTypeMod == "tofrom") + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO | llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + + if (mapTypeMod == "delete") + mapTypeBits |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE; - return success(); - }; - - auto parseMap = [&]() -> ParseResult { - mapTypeBits = llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; - if (parser.parseLParen() || - parser.parseCommaSeparatedList(parseTypeAndMod) || - parser.parseArrow() || parser.parseOperand(arg1) || - parser.parseColon() || parser.parseType(arg1Type) || - parser.parseRParen()) - return failure(); - map_operands.push_back(arg1); - map_operand_types.push_back(arg1Type); - arg2 = parser.getBuilder().getIntegerAttr( - parser.getBuilder().getI64Type(), - static_cast< - std::underlying_type_t>( - mapTypeBits)); - mapTypesVec.push_back(arg2); return success(); }; - if (parser.parseCommaSeparatedList(parseMap)) + if (parser.parseCommaSeparatedList(parseTypeAndMod)) return failure(); - SmallVector mapTypesAttr(mapTypesVec.begin(), mapTypesVec.end()); - map_types = ArrayAttr::get(parser.getContext(), mapTypesAttr); + mapType = parser.getBuilder().getIntegerAttr( + parser.getBuilder().getIntegerType(64, /*isSigned=*/false), + static_cast>( + mapTypeBits)); + return success(); } +/// Prints a map_entries map type from its numeric value out into its string +/// format. static void printMapClause(OpAsmPrinter &p, Operation *op, - OperandRange map_operands, - TypeRange map_operand_types, ArrayAttr map_types) { - - // Helper function to get bitwise AND of `value` and 'flag' - auto bitAnd = [](int64_t value, - llvm::omp::OpenMPOffloadMappingFlags flag) -> bool { - return value & - static_cast< - std::underlying_type_t>( - flag); - }; + IntegerAttr mapType) { + uint64_t mapTypeBits = mapType.getUInt(); + + bool emitAllocRelease = true; + llvm::SmallVector mapTypeStrs; + + // Handling of always, close, and present is placed at the beginning of the + // string to aid readability. + if (mapTypeToBitFlag(mapTypeBits, + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS)) + mapTypeStrs.push_back("always"); + if (mapTypeToBitFlag(mapTypeBits, + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE)) + mapTypeStrs.push_back("close"); + if (mapTypeToBitFlag(mapTypeBits, + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) + mapTypeStrs.push_back("present"); + + // Special handling of to/from/tofrom/delete and release/alloc: release + + // alloc are the absence of one of the other flags, whereas tofrom requires + // both the to and from flags to be set. + bool to = mapTypeToBitFlag(mapTypeBits, + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); + bool from = mapTypeToBitFlag( + mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM); + if (to && from) { + emitAllocRelease = false; + mapTypeStrs.push_back("tofrom"); + } else if (from) { + emitAllocRelease = false; + mapTypeStrs.push_back("from"); + } else if (to) { + emitAllocRelease = false; + mapTypeStrs.push_back("to"); + } + if (mapTypeToBitFlag(mapTypeBits, + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE)) { + emitAllocRelease = false; + mapTypeStrs.push_back("delete"); + } + if (emitAllocRelease) + mapTypeStrs.push_back("exit_release_or_enter_alloc"); - assert(map_operands.size() == map_types.size()); - - for (unsigned i = 0, e = map_operands.size(); i < e; i++) { - int64_t mapTypeBits = 0x00; - Value mapOp = map_operands[i]; - Attribute mapTypeOp = map_types[i]; - - assert(llvm::isa(mapTypeOp)); - mapTypeBits = llvm::cast(mapTypeOp).getInt(); - - bool always = bitAnd(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS); - bool close = bitAnd(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_CLOSE); - bool present = bitAnd( - mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PRESENT); - - bool to = - bitAnd(mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); - bool from = - bitAnd(mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM); - bool del = bitAnd(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE); - - std::string typeModStr, typeStr; - llvm::raw_string_ostream typeMod(typeModStr), type(typeStr); - - if (always) - typeMod << "always, "; - if (close) - typeMod << "close, "; - if (present) - typeMod << "present, "; - - if (to) - type << "to"; - if (from) - type << "from"; - if (del) - type << "delete"; - if (type.str().empty()) - type << (isa(op) ?
"release" : "alloc"); - - p << '(' << typeMod.str() << type.str() << " -> " << mapOp << " : " - << mapOp.getType() << ')'; - if (i + 1 < e) + for (unsigned int i = 0; i < mapTypeStrs.size(); ++i) { + p << mapTypeStrs[i]; + if (i + 1 < mapTypeStrs.size()) { p << ", "; + } } } -static LogicalResult verifyMapClause(Operation *op, OperandRange map_operands, - std::optional map_types) { - // Helper function to get bitwise AND of `value` and 'flag' - auto bitAnd = [](int64_t value, - llvm::omp::OpenMPOffloadMappingFlags flag) -> bool { - return value & - static_cast< - std::underlying_type_t>( - flag); - }; - if (!map_types) { - if (!map_operands.empty()) - return emitError(op->getLoc(), "missing mapTypes"); - else - return success(); - } +static void printCaptureType(OpAsmPrinter &p, Operation *op, + VariableCaptureKindAttr mapCaptureType) { + std::string typeCapStr; + llvm::raw_string_ostream typeCap(typeCapStr); + if (mapCaptureType.getValue() == mlir::omp::VariableCaptureKind::ByRef) + typeCap << "ByRef"; + if (mapCaptureType.getValue() == mlir::omp::VariableCaptureKind::ByCopy) + typeCap << "ByCopy"; + if (mapCaptureType.getValue() == mlir::omp::VariableCaptureKind::VLAType) + typeCap << "VLAType"; + if (mapCaptureType.getValue() == mlir::omp::VariableCaptureKind::This) + typeCap << "This"; + p << typeCap.str(); +} + +static ParseResult parseCaptureType(OpAsmParser &parser, + VariableCaptureKindAttr &mapCapture) { + StringRef mapCaptureKey; + if (parser.parseKeyword(&mapCaptureKey)) + return failure(); - if (map_operands.empty() && !map_types->empty()) - return emitError(op->getLoc(), "missing mapOperands"); + if (mapCaptureKey == "This") + mapCapture = mlir::omp::VariableCaptureKindAttr::get( + parser.getContext(), mlir::omp::VariableCaptureKind::This); + if (mapCaptureKey == "ByRef") + mapCapture = mlir::omp::VariableCaptureKindAttr::get( + parser.getContext(), mlir::omp::VariableCaptureKind::ByRef); + if (mapCaptureKey == "ByCopy") + mapCapture = mlir::omp::VariableCaptureKindAttr::get( + parser.getContext(), mlir::omp::VariableCaptureKind::ByCopy); + if (mapCaptureKey == "VLAType") + mapCapture = mlir::omp::VariableCaptureKindAttr::get( + parser.getContext(), mlir::omp::VariableCaptureKind::VLAType); - if (map_types->empty() && !map_operands.empty()) - return emitError(op->getLoc(), "missing mapTypes"); + return success(); +} - if (map_operands.size() != map_types->size()) - return emitError(op->getLoc(), - "mismatch in number of mapOperands and mapTypes"); +static LogicalResult verifyMapClause(Operation *op, OperandRange mapOperands) { - for (const auto &mapTypeOp : *map_types) { - int64_t mapTypeBits = 0x00; + for (auto mapOp : mapOperands) { + if (!mapOp.getDefiningOp()) + emitError(op->getLoc(), "missing map operation"); - if (!llvm::isa(mapTypeOp)) - return failure(); + if (auto MapInfoOp = + mlir::dyn_cast(mapOp.getDefiningOp())) { + + if (!MapInfoOp.getMapType().has_value()) + emitError(op->getLoc(), "missing map type for map operand"); + + if (!MapInfoOp.getMapCaptureType().has_value()) + emitError(op->getLoc(), "missing map capture type for map operand"); + + uint64_t mapTypeBits = MapInfoOp.getMapType().value(); + + bool to = mapTypeToBitFlag( + mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); + bool from = mapTypeToBitFlag( + mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM); + bool del = mapTypeToBitFlag( + mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE); + + if ((isa(op) || isa(op)) && del) + return emitError(op->getLoc(), + 
"to, from, tofrom and alloc map types are permitted"); - mapTypeBits = llvm::cast(mapTypeOp).getInt(); - - bool to = - bitAnd(mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO); - bool from = - bitAnd(mapTypeBits, llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM); - bool del = bitAnd(mapTypeBits, - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_DELETE); - - if ((isa(op) || isa(op)) && del) - return emitError(op->getLoc(), - "to, from, tofrom and alloc map types are permitted"); - if (isa(op) && (from || del)) - return emitError(op->getLoc(), "to and alloc map types are permitted"); - if (isa(op) && to) - return emitError(op->getLoc(), - "from, release and delete map types are permitted"); + if (isa(op) && (from || del)) + return emitError(op->getLoc(), "to and alloc map types are permitted"); + + if (isa(op) && to) + return emitError(op->getLoc(), + "from, release and delete map types are permitted"); + } else { + emitError(op->getLoc(), "map argument is not a map entry operation"); + } } return success(); @@ -852,19 +865,19 @@ LogicalResult DataOp::verify() { return ::emitError(this->getLoc(), "At least one of map, useDevicePtr, or " "useDeviceAddr operand must be present"); } - return verifyMapClause(*this, getMapOperands(), getMapTypes()); + return verifyMapClause(*this, getMapOperands()); } LogicalResult EnterDataOp::verify() { - return verifyMapClause(*this, getMapOperands(), getMapTypes()); + return verifyMapClause(*this, getMapOperands()); } LogicalResult ExitDataOp::verify() { - return verifyMapClause(*this, getMapOperands(), getMapTypes()); + return verifyMapClause(*this, getMapOperands()); } LogicalResult TargetOp::verify() { - return verifyMapClause(*this, getMapOperands(), getMapTypes()); + return verifyMapClause(*this, getMapOperands()); } //===----------------------------------------------------------------------===// @@ -1455,8 +1468,23 @@ LogicalResult CancellationPointOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// DataBoundsOp +//===----------------------------------------------------------------------===// + +LogicalResult DataBoundsOp::verify() { + auto extent = getExtent(); + auto upperbound = getUpperBound(); + if (!extent && !upperbound) + return emitError("expected extent or upperbound."); + return success(); +} + #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc" #define GET_OP_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOps.cpp.inc" + +#define GET_TYPEDEF_CLASSES +#include "mlir/Dialect/OpenMP/OpenMPOpsTypes.cpp.inc" diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 5565aefbad18d..45e68e23a71d6 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -530,7 +530,7 @@ ParseResult ForOp::parse(OpAsmParser &parser, OperationState &result) { return success(); } -Region &ForOp::getLoopBody() { return getRegion(); } +SmallVector ForOp::getLoopRegions() { return {&getRegion()}; } ForOp mlir::scf::getForInductionVarOwner(Value val) { auto ivArg = llvm::dyn_cast(val); @@ -558,11 +558,11 @@ void ForOp::getSuccessorRegions(RegionBranchPoint point, // Both the operation itself and the region may be branching into the body or // back into the operation itself. It is possible for loop not to enter the // body. 
- regions.push_back(RegionSuccessor(&getLoopBody(), getRegionIterArgs())); + regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); regions.push_back(RegionSuccessor(getResults())); } -Region &ForallOp::getLoopBody() { return getRegion(); } +SmallVector ForallOp::getLoopRegions() { return {&getRegion()}; } /// Promotes the loop body of a forallOp to its containing block if it can be /// determined that the loop has a single iteration. @@ -894,7 +894,7 @@ struct SimplifyTrivialLoops : public OpRewritePattern { blockArgs.reserve(op.getInitArgs().size() + 1); blockArgs.push_back(op.getLowerBound()); llvm::append_range(blockArgs, op.getInitArgs()); - replaceOpWithRegion(rewriter, op, op.getLoopBody(), blockArgs); + replaceOpWithRegion(rewriter, op, op.getRegion(), blockArgs); return success(); } @@ -932,7 +932,7 @@ replaceTensorCastForOpIterArg(PatternRewriter &rewriter, OpOperand &operand, assert(operand.get().getType() != replacement.getType() && "Expected a different type"); SmallVector newIterOperands; - for (OpOperand &opOperand : forOp.getIterOpOperands()) { + for (OpOperand &opOperand : forOp.getInitArgsMutable()) { if (opOperand.getOperandNumber() == operand.getOperandNumber()) { newIterOperands.push_back(replacement); continue; @@ -1015,7 +1015,7 @@ struct ForOpTensorCastFolder : public OpRewritePattern { LogicalResult matchAndRewrite(ForOp op, PatternRewriter &rewriter) const override { - for (auto it : llvm::zip(op.getIterOpOperands(), op.getResults())) { + for (auto it : llvm::zip(op.getInitArgsMutable(), op.getResults())) { OpOperand &iterOpOperand = std::get<0>(it); auto incomingCast = iterOpOperand.get().getDefiningOp(); if (!incomingCast || @@ -2872,7 +2872,7 @@ void ParallelOp::print(OpAsmPrinter &p) { /*elidedAttrs=*/ParallelOp::getOperandSegmentSizeAttr()); } -Region &ParallelOp::getLoopBody() { return getRegion(); } +SmallVector ParallelOp::getLoopRegions() { return {&getRegion()}; } ParallelOp mlir::scf::getParallelForInductionVarOwner(Value val) { auto ivArg = llvm::dyn_cast(val); @@ -2926,7 +2926,7 @@ struct ParallelOpSingleOrZeroIterationDimsFolder // loop body and nested ReduceOp's SmallVector results; results.reserve(op.getInitVals().size()); - for (auto &bodyOp : op.getLoopBody().front().without_terminator()) { + for (auto &bodyOp : op.getBody()->without_terminator()) { auto reduce = dyn_cast(bodyOp); if (!reduce) { rewriter.clone(bodyOp, mapping); @@ -2965,7 +2965,7 @@ struct MergeNestedParallelLoops : public OpRewritePattern { LogicalResult matchAndRewrite(ParallelOp op, PatternRewriter &rewriter) const override { - Block &outerBody = op.getLoopBody().front(); + Block &outerBody = *op.getBody(); if (!llvm::hasSingleElement(outerBody.without_terminator())) return failure(); @@ -2985,7 +2985,7 @@ struct MergeNestedParallelLoops : public OpRewritePattern { auto bodyBuilder = [&](OpBuilder &builder, Location /*loc*/, ValueRange iterVals, ValueRange) { - Block &innerBody = innerOp.getLoopBody().front(); + Block &innerBody = *innerOp.getBody(); assert(iterVals.size() == (outerBody.getNumArguments() + innerBody.getNumArguments())); IRMapping mapping; @@ -3203,6 +3203,10 @@ void WhileOp::getSuccessorRegions(RegionBranchPoint point, regions.emplace_back(&getAfter(), getAfter().getArguments()); } +SmallVector WhileOp::getLoopRegions() { + return {&getBefore(), &getAfter()}; +} + /// Parses a `while` op. 
/// /// op ::= `scf.while` assignments `:` function-type region `do` region diff --git a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp index 47f6da25b3251..88c6f3da656f3 100644 --- a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp @@ -35,11 +35,8 @@ struct ForOpInterface // An EQ constraint can be added if the yielded value (dimension size) // equals the corresponding block argument (dimension size). - assert(forOp.getLoopBody().hasOneBlock() && - "multiple blocks not supported"); - Value yieldedValue = - cast(forOp.getLoopBody().front().getTerminator()) - .getOperand(iterArgIdx); + Value yieldedValue = cast(forOp.getBody()->getTerminator()) + .getOperand(iterArgIdx); Value iterArg = forOp.getRegionIterArg(iterArgIdx); Value initArg = forOp.getInitArgs()[iterArgIdx]; @@ -68,7 +65,7 @@ struct ForOpInterface // Stop when reaching a value that is defined outside of the loop. It // is impossible to reach an iter_arg from there. Operation *op = v.getDefiningOp(); - return forOp.getLoopBody().findAncestorOpInRegion(*op) == nullptr; + return forOp.getRegion().findAncestorOpInRegion(*op) == nullptr; }); if (failed(status)) return; diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp index 665171dca4e28..bcbc693a9742c 100644 --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -325,7 +325,7 @@ DenseSet getEquivalentBuffers(Block::BlockArgListType bbArgs, /// Helper function for loop bufferization. Return the bufferized values of the /// given OpOperands. If an operand is not a tensor, return the original value. static FailureOr> -getBuffers(RewriterBase &rewriter, MutableArrayRef operands, +getBuffers(RewriterBase &rewriter, MutableOperandRange operands, const BufferizationOptions &options) { SmallVector result; for (OpOperand &opOperand : operands) { @@ -489,8 +489,7 @@ struct ForOpInterface auto forOp = cast(op); OpOperand &forOperand = forOp.getOpOperandForResult(opResult); auto bbArg = forOp.getRegionIterArgForOpOperand(forOperand); - auto yieldOp = - cast(forOp.getLoopBody().front().getTerminator()); + auto yieldOp = cast(forOp.getBody()->getTerminator()); bool equivalentYield = state.areEquivalentBufferizedValues( bbArg, yieldOp->getOperand(opResult.getResultNumber())); return equivalentYield ? BufferRelation::Equivalent @@ -525,8 +524,7 @@ struct ForOpInterface // satisfied. Otherwise, we cannot be sure and must yield a new buffer copy. // (New buffer copies do not alias with any buffer.) auto forOp = cast(op); - auto yieldOp = - cast(forOp.getLoopBody().front().getTerminator()); + auto yieldOp = cast(forOp.getBody()->getTerminator()); OpBuilder::InsertionGuard g(rewriter); rewriter.setInsertionPoint(yieldOp); @@ -578,8 +576,7 @@ struct ForOpInterface .getResultNumber(); // Compute the bufferized type. 
- auto yieldOp = - cast(forOp.getLoopBody().front().getTerminator()); + auto yieldOp = cast(forOp.getBody()->getTerminator()); Value yieldedValue = yieldOp.getOperand(resultNum); BlockArgument iterArg = forOp.getRegionIterArgs()[resultNum]; Value initArg = forOp.getInitArgs()[resultNum]; @@ -590,7 +587,7 @@ struct ForOpInterface LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { auto forOp = cast(op); - Block *oldLoopBody = &forOp.getLoopBody().front(); + Block *oldLoopBody = forOp.getBody(); // Indices of all iter_args that have tensor type. These are the ones that // are bufferized. @@ -598,7 +595,7 @@ struct ForOpInterface // The new memref init_args of the loop. FailureOr> maybeInitArgs = - getBuffers(rewriter, forOp.getIterOpOperands(), options); + getBuffers(rewriter, forOp.getInitArgsMutable(), options); if (failed(maybeInitArgs)) return failure(); SmallVector initArgs = *maybeInitArgs; @@ -624,7 +621,7 @@ struct ForOpInterface forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep(), castedInitArgs); newForOp->setAttrs(forOp->getAttrs()); - Block *loopBody = &newForOp.getLoopBody().front(); + Block *loopBody = newForOp.getBody(); // Set up new iter_args. The loop body uses tensors, so wrap the (memref) // iter_args of the new loop in ToTensorOps. @@ -657,8 +654,7 @@ struct ForOpInterface return success(); auto forOp = cast(op); - auto yieldOp = - cast(forOp.getLoopBody().front().getTerminator()); + auto yieldOp = cast(forOp.getBody()->getTerminator()); for (OpResult opResult : op->getOpResults()) { if (!isa(opResult.getType())) continue; @@ -816,7 +812,7 @@ struct WhileOpInterface // The new memref init_args of the loop. FailureOr> maybeInitArgs = - getBuffers(rewriter, whileOp->getOpOperands(), options); + getBuffers(rewriter, whileOp.getInitsMutable(), options); if (failed(maybeInitArgs)) return failure(); SmallVector initArgs = *maybeInitArgs; diff --git a/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp b/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp index 35d242327178e..7932c38a3e8d8 100644 --- a/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp @@ -126,7 +126,7 @@ class ConvertForOpTypes // new op's regions doesn't remove the child ops from the worklist). // convertRegionTypes already takes care of 1:N conversion. - if (failed(rewriter.convertRegionTypes(&op.getLoopBody(), *typeConverter))) + if (failed(rewriter.convertRegionTypes(&op.getRegion(), *typeConverter))) return std::nullopt; // Unpacked the iteration arguments. @@ -146,8 +146,8 @@ class ConvertForOpTypes // We do not need the empty block created by rewriter. rewriter.eraseBlock(newOp.getBody(0)); // Inline the type converted region from the original operation. 
- rewriter.inlineRegionBefore(op.getLoopBody(), newOp.getLoopBody(), - newOp.getLoopBody().end()); + rewriter.inlineRegionBefore(op.getRegion(), newOp.getRegion(), + newOp.getRegion().end()); return newOp; } diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 1ce25565edcaf..6cfba3fef15eb 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -31,19 +31,11 @@ using namespace mlir; scf::SCFTilingOptions & -scf::SCFTilingOptions::setTileSizes(ArrayRef ts) { +scf::SCFTilingOptions::setTileSizes(ArrayRef ts) { assert(!tileSizeComputationFunction && "tile sizes already set"); - SmallVector tileSizes(ts.begin(), ts.end()); + auto tileSizes = llvm::to_vector(ts); tileSizeComputationFunction = [tileSizes](OpBuilder &b, Operation *op) { - OpBuilder::InsertionGuard guard(b); - b.setInsertionPointToStart( - &op->getParentWithTrait() - ->getRegion(0) - .front()); - return llvm::to_vector<4>(map_range(tileSizes, [&](int64_t s) { - Value v = b.create(op->getLoc(), s); - return v; - })); + return tileSizes; }; return *this; } @@ -108,17 +100,16 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, /// Generate an empty loop nest that represents the tiled loop nest shell. /// - `loopRanges` specifies the lb, ub and step of the untiled iteration space. -/// - `tileSizeVals` is the tile sizes to use. Zero represent untiled loops. +/// - `tileSizes` is the tile sizes to use. Zero represents untiled loops. /// - In `offsets` and `sizes` return the multi-dimensional offset and size of /// the /// tile processed within the inner most loop. -static SmallVector -generateTileLoopNest(OpBuilder &builder, Location loc, - ArrayRef loopRanges, ArrayRef tileSizeVals, - SmallVector &offsets, - SmallVector &sizes) { +static SmallVector generateTileLoopNest( + OpBuilder &builder, Location loc, ArrayRef loopRanges, + ArrayRef tileSizes, SmallVector &offsets, + SmallVector &sizes) { assert(!loopRanges.empty() && "expected at least one loop range"); - assert(loopRanges.size() == tileSizeVals.size() && + assert(loopRanges.size() == tileSizes.size() && "expected as many tile sizes as loop ranges"); OpBuilder::InsertionGuard guard(builder); SmallVector loops; @@ -130,7 +121,8 @@ generateTileLoopNest(OpBuilder &builder, Location loc, getValueOrCreateConstantIndexOp(builder, loc, loopRange.value().offset); Value size = getValueOrCreateConstantIndexOp(builder, loc, loopRange.value().size); - Value tileSize = tileSizeVals[loopRange.index()]; + Value tileSize = getValueOrCreateConstantIndexOp( + builder, loc, tileSizes[loopRange.index()]); // No loops if tile size is zero. Set offset and size to the loop // offset and size. if (matchPattern(tileSize, m_Zero())) { @@ -296,10 +288,10 @@ mlir::scf::tileUsingSCFForOp(RewriterBase &rewriter, TilingInterface op, // skips tiling a particular dimension. This convention is significantly // simpler to handle instead of adjusting affine maps to account for missing // dimensions.
- SmallVector tileSizeVector = + SmallVector tileSizeVector = options.tileSizeComputationFunction(rewriter, op); if (tileSizeVector.size() < iterationDomain.size()) { - auto zero = rewriter.create(op.getLoc(), 0); + auto zero = rewriter.getIndexAttr(0); tileSizeVector.append(numLoops - tileSizeVector.size(), zero); } @@ -402,17 +394,17 @@ mlir::scf::tileUsingSCFForOp(RewriterBase &rewriter, TilingInterface op, FailureOr mlir::scf::tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op, - ArrayRef tileSize) { + ArrayRef tileSizes) { Location loc = op.getLoc(); // Ops implementing PartialReductionOpInterface are expected to implement // TilingInterface. auto tilingInterfaceOp = cast(op.getOperation()); SmallVector iterationDomain = tilingInterfaceOp.getIterationDomain(b); - SmallVector tileSizeVector = - getValueOrCreateConstantIndexOp(b, loc, tileSize); - if (tileSizeVector.size() < iterationDomain.size()) { - auto zero = b.create(loc, 0); - tileSizeVector.append(iterationDomain.size() - tileSizeVector.size(), zero); + auto tileSizesVector = llvm::to_vector(tileSizes); + if (tileSizesVector.size() < iterationDomain.size()) { + auto zero = b.getIndexAttr(0); + tileSizesVector.append(iterationDomain.size() - tileSizesVector.size(), + zero); } if (op->getNumResults() != 1) return b.notifyMatchFailure( @@ -429,7 +421,7 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b, // 1. create the inital tensor value. FailureOr identityTensor = - op.generateInitialTensorForPartialReduction(b, loc, tileSize, + op.generateInitialTensorForPartialReduction(b, loc, tileSizesVector, reductionDims); if (failed(identityTensor)) return b.notifyMatchFailure(op, @@ -437,7 +429,7 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b, // 2. Create the nested loops. SmallVector offsets, sizes; SmallVector loops = generateTileLoopNest( - b, loc, iterationDomain, tileSizeVector, offsets, sizes); + b, loc, iterationDomain, tileSizesVector, offsets, sizes); // 3. Generate the tiled implementation within the inner most loop. b.setInsertionPoint(loops.back().getBody()->getTerminator()); @@ -508,7 +500,7 @@ mlir::scf::tileAndFuseProducerOfSlice(RewriterBase &rewriter, MutableArrayRef loops) { // 1. Get the producer of the source (potentially walking through // `iter_args` of nested `scf.for`) - auto [fusableProducer, destinationIterArg] = + auto [fusableProducer, destinationInitArg] = getUntiledProducerFromSliceSource(&candidateSliceOp.getSourceMutable()[0], loops); if (!fusableProducer) @@ -575,17 +567,16 @@ mlir::scf::tileAndFuseProducerOfSlice(RewriterBase &rewriter, // TODO: This can be modeled better if the `DestinationStyleOpInterface`. // Update to use that when it does become available. 
scf::ForOp outerMostLoop = loops.front(); - std::optional iterArgNumber; - if (destinationIterArg) { - iterArgNumber = - outerMostLoop.getIterArgNumberForOpOperand(*destinationIterArg.value()); - } - if (iterArgNumber) { + if (destinationInitArg && + (*destinationInitArg)->getOwner() == outerMostLoop) { + unsigned iterArgNumber = + outerMostLoop.getResultForOpOperand(**destinationInitArg) + .getResultNumber(); int64_t resultNumber = fusableProducer.getResultNumber(); if (auto dstOp = dyn_cast(fusableProducer.getOwner())) { - outerMostLoop.setIterArg(iterArgNumber.value(), - dstOp.getTiedOpOperand(fusableProducer)->get()); + (*destinationInitArg) + ->set(dstOp.getTiedOpOperand(fusableProducer)->get()); } for (auto tileAndFusedOp : tileAndFuseResult->tiledOps) { auto dstOp = dyn_cast(tileAndFusedOp); @@ -594,7 +585,7 @@ mlir::scf::tileAndFuseProducerOfSlice(RewriterBase &rewriter, scf::ForOp innerMostLoop = loops.back(); updateDestinationOperandsForTiledOp( rewriter, dstOp.getDpsInitOperand(resultNumber)->get(), - innerMostLoop.getRegionIterArgs()[iterArgNumber.value()]); + innerMostLoop.getRegionIterArgs()[iterArgNumber]); } } return scf::SCFFuseProducerOfSliceResult{fusableProducer, diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index 222a9aa395c4f..411503700eb01 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -1006,9 +1006,9 @@ scf::ForallOp mlir::fuseIndependentSiblingForallLoops(scf::ForallOp target, // Append everything except the terminator into the fused operation. rewriter.setInsertionPointToStart(fusedLoop.getBody()); - for (Operation &op : target.getLoopBody().begin()->without_terminator()) + for (Operation &op : target.getBody()->without_terminator()) rewriter.clone(op, fusedMapping); - for (Operation &op : source.getLoopBody().begin()->without_terminator()) + for (Operation &op : source.getBody()->without_terminator()) rewriter.clone(op, fusedMapping); // Fuse the old terminator in_parallel ops into the new one. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp index 5e53dbe1cc283..029ecb0708941 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseBufferRewriting.cpp @@ -563,11 +563,14 @@ static void createChoosePivot(OpBuilder &builder, ModuleOp module, // p = (lo+hi)/2 // pivot index // i = lo // j = hi-1 -// while (i < j) do { +// while (true) do { // while (xs[i] < xs[p]) i ++; // i_eq = (xs[i] == xs[p]); // while (xs[j] > xs[p]) j --; // j_eq = (xs[j] == xs[p]); +// +// if (i >= j) return j + 1; +// // if (i < j) { // swap(xs[i], xs[j]) // if (i == p) { @@ -581,8 +584,7 @@ static void createChoosePivot(OpBuilder &builder, ModuleOp module, // } // } // } -// return p -// } +// } static void createPartitionFunc(OpBuilder &builder, ModuleOp module, func::FuncOp func, uint64_t nx, uint64_t ny, bool isCoo, uint32_t nTrailingP = 0) { @@ -605,22 +607,22 @@ static void createPartitionFunc(OpBuilder &builder, ModuleOp module, Value i = lo; Value j = builder.create(loc, hi, c1); createChoosePivot(builder, module, func, nx, ny, isCoo, i, j, p, args); - SmallVector operands{i, j, p}; // Exactly three values. - SmallVector types{i.getType(), j.getType(), p.getType()}; + Value trueVal = constantI1(builder, loc, true); // The value for while (true) + SmallVector operands{i, j, p, trueVal}; // Exactly four values. 
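+ // The `while (true)` in the pseudocode above is modeled by threading an i1 + // continuation flag through the scf.while: the before-region forwards it as + // the loop condition, and the exit path below yields false for it.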
+ SmallVector types{i.getType(), j.getType(), p.getType(), + trueVal.getType()}; scf::WhileOp whileOp = builder.create(loc, types, operands); // The before-region of the WhileOp. - Block *before = - builder.createBlock(&whileOp.getBefore(), {}, types, {loc, loc, loc}); + Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, + {loc, loc, loc, loc}); builder.setInsertionPointToEnd(before); - Value cond = builder.create(loc, arith::CmpIPredicate::ult, - before->getArgument(0), - before->getArgument(1)); - builder.create(loc, cond, before->getArguments()); + builder.create(loc, before->getArgument(3), + before->getArguments()); // The after-region of the WhileOp. Block *after = - builder.createBlock(&whileOp.getAfter(), {}, types, {loc, loc, loc}); + builder.createBlock(&whileOp.getAfter(), {}, types, {loc, loc, loc, loc}); builder.setInsertionPointToEnd(after); i = after->getArgument(0); j = after->getArgument(1); @@ -637,7 +639,8 @@ static void createPartitionFunc(OpBuilder &builder, ModuleOp module, j = jresult; // If i < j: - cond = builder.create(loc, arith::CmpIPredicate::ult, i, j); + Value cond = + builder.create(loc, arith::CmpIPredicate::ult, i, j); scf::IfOp ifOp = builder.create(loc, types, cond, /*else=*/true); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); SmallVector swapOperands{i, j}; @@ -675,11 +678,15 @@ static void createPartitionFunc(OpBuilder &builder, ModuleOp module, builder.setInsertionPointAfter(ifOp2); builder.create( loc, - ValueRange{ifOp2.getResult(0), ifOp2.getResult(1), ifOpI.getResult(0)}); + ValueRange{ifOp2.getResult(0), ifOp2.getResult(1), ifOpI.getResult(0), + /*cont=*/constantI1(builder, loc, true)}); - // False branch for if i < j: + // False branch for if i < j (i.e., i >= j): builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - builder.create(loc, ValueRange{i, j, p}); + p = builder.create(loc, j, + constantOne(builder, loc, j.getType())); + builder.create( + loc, ValueRange{i, j, p, /*cont=*/constantI1(builder, loc, false)}); // Return for the whileOp. builder.setInsertionPointAfter(ifOp); @@ -927,6 +934,8 @@ createQuickSort(OpBuilder &builder, ModuleOp module, func::FuncOp func, Location loc = func.getLoc(); Value lo = args[loIdx]; Value hi = args[hiIdx]; + SmallVector types(2, lo.getType()); // Only two types. + FlatSymbolRefAttr partitionFunc = getMangledSortHelperFunc( builder, func, {IndexType::get(context)}, kPartitionFuncNamePrefix, nx, ny, isCoo, args.drop_back(nTrailingP), createPartitionFunc); @@ -935,14 +944,25 @@ createQuickSort(OpBuilder &builder, ModuleOp module, func::FuncOp func, TypeRange{IndexType::get(context)}, args.drop_back(nTrailingP)) .getResult(0); - Value pP1 = - builder.create(loc, p, constantIndex(builder, loc, 1)); + Value lenLow = builder.create(loc, p, lo); Value lenHigh = builder.create(loc, hi, p); + // Partition already sorts array with len <= 2 + Value c2 = constantIndex(builder, loc, 2); + Value len = builder.create(loc, hi, lo); + Value lenGtTwo = + builder.create(loc, arith::CmpIPredicate::ugt, len, c2); + scf::IfOp ifLenGtTwo = + builder.create(loc, types, lenGtTwo, /*else=*/true); + builder.setInsertionPointToStart(&ifLenGtTwo.getElseRegion().front()); + // Returns an empty range to mark the entire region is fully sorted. + builder.create(loc, ValueRange{lo, lo}); + + // Else len > 2, need recursion. 
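// To make the generated control flow easier to follow, here is a plain C++
// rendering of the same scheme (a sketch only: it uses a value-based pivot,
// while the IR built above tracks the pivot by index and handles the
// equal-element cases explicitly).
#include <cstddef>
#include <utility>
#include <vector>

// Hoare-style partition mirroring the scf.while built by createPartitionFunc:
// iterate unconditionally and return j + 1 once the cursors cross.
static std::size_t partitionSketch(std::vector<int> &xs, std::size_t lo,
                                   std::size_t hi) {
  int pivot = xs[(lo + hi) / 2];
  std::size_t i = lo, j = hi - 1;
  while (true) {
    while (xs[i] < pivot)
      ++i;
    while (xs[j] > pivot)
      --j;
    if (i >= j)
      return j + 1; // first index of the upper partition
    std::swap(xs[i], xs[j]);
    ++i;
    --j;
  }
}

// Driver mirroring createQuickSort: partition always runs first (for ranges
// of length <= 2 it already leaves the range sorted), then the smaller side
// is sorted recursively while the larger side is looped on, bounding the
// recursion depth at O(log n).
static void quickSortSketch(std::vector<int> &xs, std::size_t lo,
                            std::size_t hi) {
  while (hi - lo > 1) {
    std::size_t p = partitionSketch(xs, lo, hi);
    if (hi - lo <= 2)
      break; // the partition call above left nothing unsorted
    if (p - lo <= hi - p) {
      quickSortSketch(xs, lo, p); // smaller, lower range: recurse
      lo = p;                     // larger, upper range: iterate
    } else {
      quickSortSketch(xs, p, hi); // smaller, upper range: recurse
      hi = p;                     // larger, lower range: iterate
    }
  }
}
// Usage: std::vector<int> v{5, 2, 8, 1, 3}; quickSortSketch(v, 0, v.size());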
+ builder.setInsertionPointToStart(&ifLenGtTwo.getThenRegion().front()); Value cond = builder.create(loc, arith::CmpIPredicate::ule, lenLow, lenHigh); - SmallVector types(2, lo.getType()); // Only two types. scf::IfOp ifOp = builder.create(loc, types, cond, /*else=*/true); Value c0 = constantIndex(builder, loc, 0); @@ -961,14 +981,17 @@ createQuickSort(OpBuilder &builder, ModuleOp module, func::FuncOp func, // the bigger partition to be processed by the enclosed while-loop. builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); mayRecursion(lo, p, lenLow); - builder.create(loc, ValueRange{pP1, hi}); + builder.create(loc, ValueRange{p, hi}); builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - mayRecursion(pP1, hi, lenHigh); + mayRecursion(p, hi, lenHigh); builder.create(loc, ValueRange{lo, p}); builder.setInsertionPointAfter(ifOp); - return std::make_pair(ifOp.getResult(0), ifOp.getResult(1)); + builder.create(loc, ifOp.getResults()); + + builder.setInsertionPointAfter(ifLenGtTwo); + return std::make_pair(ifLenGtTwo.getResult(0), ifLenGtTwo.getResult(1)); } /// Creates a function to perform insertion sort on the values in the range of diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp index 737058c543dac..efdd3347558b4 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -300,8 +300,8 @@ static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc, // } Value upper = irMap.lookup(forallOp.getUpperBound()[0]); scf::ForOp forOp = rewriter.create(loc, row, upper, inc); - rewriter.cloneRegionBefore(forallOp.getLoopBody(), forOp.getLoopBody(), - forOp.getLoopBody().begin(), irMap); + rewriter.cloneRegionBefore(forallOp.getRegion(), forOp.getRegion(), + forOp.getRegion().begin(), irMap); // Done. rewriter.setInsertionPointAfter(forOp); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp index 871686a4ada0f..d75601e369a0d 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp @@ -830,6 +830,7 @@ class SparseTensorNewConverter : public OpConversionPattern { }; /// Sparse conversion rule for the alloc operator. +/// TODO(springerm): remove when bufferization.alloc_tensor is gone class SparseTensorAllocConverter : public OpConversionPattern { public: @@ -864,6 +865,37 @@ class SparseTensorAllocConverter } }; +/// Sparse conversion rule for the empty tensor. +class SparseTensorEmptyConverter : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(tensor::EmptyOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + const auto stt = getSparseTensorType(op); + if (!stt.hasEncoding()) + return failure(); + // Gather all dimension sizes as SSA values. + const Dimension dimRank = stt.getDimRank(); + SmallVector dimSizes; + dimSizes.reserve(dimRank); + auto shape = op.getType().getShape(); + unsigned operandCtr = 0; + for (Dimension d = 0; d < dimRank; ++d) { + dimSizes.push_back(stt.isDynamicDim(d) + ? adaptor.getOperands()[operandCtr++] + : constantIndex(rewriter, loc, shape[d])); + } + // Generate the call to construct empty tensor. 
The sizes are + // explicitly defined by the arguments to the alloc operator. + rewriter.replaceOp(op, NewCallParams(rewriter, loc) + .genBuffers(stt, dimSizes) + .genNewCall(Action::kEmpty)); + return success(); + } +}; + /// Sparse conversion rule for the convert operator. class SparseTensorConvertConverter : public OpConversionPattern { public: @@ -1503,19 +1535,19 @@ mlir::SparseTensorTypeToPtrConverter::SparseTensorTypeToPtrConverter() { void mlir::populateSparseTensorConversionPatterns( TypeConverter &typeConverter, RewritePatternSet &patterns, const SparseTensorConversionOptions &options) { - patterns.add, - SparseReshapeConverter, - SparseTensorConcatConverter, SparseTensorAllocConverter, - SparseTensorDeallocConverter, SparseTensorToPositionsConverter, - SparseTensorToCoordinatesConverter, - SparseTensorToValuesConverter, SparseNumberOfEntriesConverter, - SparseTensorLoadConverter, SparseTensorInsertConverter, - SparseTensorExpandConverter, SparseTensorCompressConverter, - SparseTensorOutConverter, SparseTensorPackConverter>( - typeConverter, patterns.getContext()); - + patterns + .add, + SparseReshapeConverter, + SparseTensorConcatConverter, SparseTensorAllocConverter, + SparseTensorEmptyConverter, SparseTensorDeallocConverter, + SparseTensorToPositionsConverter, SparseTensorToCoordinatesConverter, + SparseTensorToValuesConverter, SparseNumberOfEntriesConverter, + SparseTensorLoadConverter, SparseTensorInsertConverter, + SparseTensorExpandConverter, SparseTensorCompressConverter, + SparseTensorOutConverter, SparseTensorPackConverter>( + typeConverter, patterns.getContext()); patterns.add(typeConverter, patterns.getContext(), options); } diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp index 1535e83376ede..b08283f007078 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -865,6 +865,20 @@ struct ReshapeOpInterface bufferization::getBufferType(reshapeOp.getResult(), options); if (failed(maybeResultMemRefType)) return failure(); + + // memref.reshape requires the source buffer to have an identity layout. + // If the source memref does not have an identity layout, clone the source + // into a new buffer with an identity layout. + auto srcType = llvm::dyn_cast(srcBuffer->getType()); + if (srcType && !srcType.getLayout().isIdentity()) { + auto identityType = + MemRefType::get(srcType.getShape(), srcType.getElementType()); + srcBuffer = rewriter + .create(op->getLoc(), + identityType, *srcBuffer) + .getResult(); + } + replaceOpWithNewBufferizedOp( rewriter, op, maybeResultMemRefType.value(), *srcBuffer, *shapeBuffer); return success(); diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 0696bf386bf75..616aad8c4aaf0 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -69,7 +69,7 @@ struct TosaInlinerInterface : public DialectInlinerInterface { //===----------------------------------------------------------------------===// /// Returns the while loop body. -Region &tosa::WhileOp::getLoopBody() { return getBody(); } +SmallVector tosa::WhileOp::getLoopRegions() { return {&getBody()}; } //===----------------------------------------------------------------------===// // Tosa dialect initialization. 
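// A hedged sketch of the layout normalization used in the ReshapeOpInterface
// hunk above, factored into a standalone helper (the helper name
// `ensureIdentityLayout` is hypothetical; the clone op is assumed to be
// bufferization::CloneOp, following the pattern of that hunk):
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

// memref.reshape requires an identity source layout, so copy strided or
// otherwise non-identity buffers into a fresh identity-layout allocation.
static Value ensureIdentityLayout(OpBuilder &b, Location loc, Value src) {
  auto srcType = llvm::dyn_cast<MemRefType>(src.getType());
  if (!srcType || srcType.getLayout().isIdentity())
    return src; // already reshape-compatible
  auto identityType =
      MemRefType::get(srcType.getShape(), srcType.getElementType());
  return b.create<bufferization::CloneOp>(loc, identityType, src).getResult();
}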
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index f1d07b85adb75..dc004cd14dc0f 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -2188,6 +2188,9 @@ LogicalResult transform::SequenceOp::verify() { } } + if (!getBodyBlock()->hasTerminator()) + return emitOpError() << "expects to have a terminator in the body"; + if (getBodyBlock()->getTerminator()->getOperandTypes() != getOperation()->getResultTypes()) { InFlightDiagnostic diag = emitOpError() diff --git a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp index 7b17e231ce106..b0c50f3d6e298 100644 --- a/mlir/lib/IR/OperationSupport.cpp +++ b/mlir/lib/IR/OperationSupport.cpp @@ -522,6 +522,14 @@ OpOperand &MutableOperandRange::operator[](unsigned index) const { return owner->getOpOperand(start + index); } +MutableArrayRef::iterator MutableOperandRange::begin() const { + return owner->getOpOperands().slice(start, length).begin(); +} + +MutableArrayRef::iterator MutableOperandRange::end() const { + return owner->getOpOperands().slice(start, length).end(); +} + //===----------------------------------------------------------------------===// // MutableOperandRangeRange diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index dcc5e3823d663..006ff515a1894 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1540,27 +1540,34 @@ static void genMapInfos(llvm::IRBuilderBase &builder, unsigned index = 0; for (const auto &mapOp : mapOperands) { + // Unlike dev_ptr and dev_addr operands these map operands point + // to a map entry operation which contains further information + // on the variable being mapped and how it should be mapped. + auto MapInfoOp = + mlir::dyn_cast(mapOp.getDefiningOp()); + // TODO: Only LLVMPointerTypes are handled. - if (!mapOp.getType().isa()) + if (!MapInfoOp.getType().isa()) return fail(); - llvm::Value *mapOpValue = moduleTranslation.lookupValue(mapOp); + llvm::Value *mapOpValue = + moduleTranslation.lookupValue(MapInfoOp.getVarPtr()); combinedInfo.BasePointers.emplace_back(mapOpValue); combinedInfo.Pointers.emplace_back(mapOpValue); combinedInfo.DevicePointers.emplace_back( llvm::OpenMPIRBuilder::DeviceInfoTy::None); - combinedInfo.Names.emplace_back( - LLVM::createMappingInformation(mapOp.getLoc(), *ompBuilder)); + combinedInfo.Names.emplace_back(LLVM::createMappingInformation( + MapInfoOp.getVarPtr().getLoc(), *ompBuilder)); combinedInfo.Types.emplace_back( llvm::omp::OpenMPOffloadMappingFlags( - mapTypes[index].dyn_cast().getInt()) | + mapTypes[index].dyn_cast().getUInt()) | (IsTargetParams ? 
llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE)); combinedInfo.Sizes.emplace_back( - builder.getInt64(getSizeInBytes(DL, mapOp.getType()))); + builder.getInt64(getSizeInBytes(DL, MapInfoOp.getVarPtr().getType()))); index++; } @@ -1609,6 +1616,19 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + auto getMapTypes = [](mlir::OperandRange mapOperands, + mlir::MLIRContext *ctx) { + SmallVector mapTypes; + for (auto mapValue : mapOperands) { + if (mapValue.getDefiningOp()) { + auto mapOp = + mlir::dyn_cast(mapValue.getDefiningOp()); + mapTypes.push_back(mapOp.getMapTypeAttr()); + } + } + return mlir::ArrayAttr::get(ctx, mapTypes); + }; + LogicalResult result = llvm::TypeSwitch(op) .Case([&](omp::DataOp dataOp) { @@ -1622,8 +1642,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, deviceID = intAttr.getInt(); mapOperands = dataOp.getMapOperands(); - if (dataOp.getMapTypes()) - mapTypes = dataOp.getMapTypes().value(); + mapTypes = + getMapTypes(dataOp.getMapOperands(), dataOp->getContext()); useDevPtrOperands = dataOp.getUseDevicePtr(); useDevAddrOperands = dataOp.getUseDeviceAddr(); return success(); @@ -1642,7 +1662,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, deviceID = intAttr.getInt(); RTLFn = llvm::omp::OMPRTL___tgt_target_data_begin_mapper; mapOperands = enterDataOp.getMapOperands(); - mapTypes = enterDataOp.getMapTypes(); + mapTypes = getMapTypes(enterDataOp.getMapOperands(), + enterDataOp->getContext()); return success(); }) .Case([&](omp::ExitDataOp exitDataOp) { @@ -1660,7 +1681,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, RTLFn = llvm::omp::OMPRTL___tgt_target_data_end_mapper; mapOperands = exitDataOp.getMapOperands(); - mapTypes = exitDataOp.getMapTypes(); + mapTypes = getMapTypes(exitDataOp.getMapOperands(), + exitDataOp->getContext()); return success(); }) .Default([&](Operation *op) { @@ -1887,7 +1909,22 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, DataLayout DL = DataLayout(opInst.getParentOfType()); SmallVector mapOperands = targetOp.getMapOperands(); - ArrayAttr mapTypes = targetOp.getMapTypes().value_or(nullptr); + + auto getMapTypes = [](mlir::OperandRange mapOperands, + mlir::MLIRContext *ctx) { + SmallVector mapTypes; + for (auto mapValue : mapOperands) { + if (mapValue.getDefiningOp()) { + auto mapOp = + mlir::dyn_cast(mapValue.getDefiningOp()); + mapTypes.push_back(mapOp.getMapTypeAttr()); + } + } + return mlir::ArrayAttr::get(ctx, mapTypes); + }; + + ArrayAttr mapTypes = + getMapTypes(targetOp.getMapOperands(), targetOp->getContext()); llvm::OpenMPIRBuilder::MapInfosTy combinedInfos; auto genMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP) @@ -2170,6 +2207,12 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( .Case([&](omp::TargetOp) { return convertOmpTarget(*op, builder, moduleTranslation); }) + .Case([&](auto op) { + // No-op, should be handled by relevant owning operations e.g. + // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. 
and then + // discarded + return success(); + }) .Default([&](Operation *inst) { return inst->emitError("unsupported OpenMP operation: ") << inst->getName(); diff --git a/mlir/lib/Transforms/Utils/CFGToSCF.cpp b/mlir/lib/Transforms/Utils/CFGToSCF.cpp index 9aab89ed75536..e7bf6628ccbd7 100644 --- a/mlir/lib/Transforms/Utils/CFGToSCF.cpp +++ b/mlir/lib/Transforms/Utils/CFGToSCF.cpp @@ -137,6 +137,13 @@ getMutableSuccessorOperands(Block *block, unsigned successorIndex) { return succOps.getMutableForwardedOperands(); } +/// Return the operand range used to transfer operands from `block` to its +/// successor with the given index. +static OperandRange getSuccessorOperands(Block *block, + unsigned successorIndex) { + return getMutableSuccessorOperands(block, successorIndex); +} + /// Appends all the block arguments from `other` to the block arguments of /// `block`, copying their types and locations. static void addBlockArgumentsFromOther(Block *block, Block *other) { @@ -175,8 +182,14 @@ class Edge { /// Returns the arguments of this edge that are passed to the block arguments /// of the successor. - MutableOperandRange getSuccessorOperands() const { - return getMutableSuccessorOperands(fromBlock, successorIndex); + MutableOperandRange getMutableSuccessorOperands() const { + return ::getMutableSuccessorOperands(fromBlock, successorIndex); + } + + /// Returns the arguments of this edge that are passed to the block arguments + /// of the successor. + OperandRange getSuccessorOperands() const { + return ::getSuccessorOperands(fromBlock, successorIndex); } }; @@ -262,7 +275,7 @@ class EdgeMultiplexer { assert(result != blockArgMapping.end() && "Edge was not originally passed to `create` method."); - MutableOperandRange successorOperands = edge.getSuccessorOperands(); + MutableOperandRange successorOperands = edge.getMutableSuccessorOperands(); // Extra arguments are always appended at the end of the block arguments. unsigned extraArgsBeginIndex = @@ -666,7 +679,7 @@ transformToReduceLoop(Block *loopHeader, Block *exitBlock, // invalidated when mutating the operands through a different // `MutableOperandRange` of the same operation. SmallVector loopHeaderSuccessorOperands = - llvm::to_vector(getMutableSuccessorOperands(latch, loopHeaderIndex)); + llvm::to_vector(getSuccessorOperands(latch, loopHeaderIndex)); // Add all values used in the next iteration to the exit block. Replace // any uses that are outside the loop with the newly created exit block. @@ -742,7 +755,7 @@ transformToReduceLoop(Block *loopHeader, Block *exitBlock, loopHeaderSuccessorOperands.push_back(argument); for (Edge edge : successorEdges(latch)) - edge.getSuccessorOperands().append(argument); + edge.getMutableSuccessorOperands().append(argument); } use.set(blockArgument); @@ -939,9 +952,8 @@ static FailureOr> transformToStructuredCFBranches( if (regionEntry->getNumSuccessors() == 1) { // Single successor we can just splice together. 
       Block *successor = regionEntry->getSuccessor(0);
-      for (auto &&[oldValue, newValue] :
-           llvm::zip(successor->getArguments(),
-                     getMutableSuccessorOperands(regionEntry, 0)))
+      for (auto &&[oldValue, newValue] : llvm::zip(
+               successor->getArguments(), getSuccessorOperands(regionEntry, 0)))
         oldValue.replaceAllUsesWith(newValue);
 
       regionEntry->getTerminator()->erase();
diff --git a/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp b/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp
index d546ab3be8bda..080492da6ae4b 100644
--- a/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopInvariantCodeMotionUtils.cpp
@@ -48,7 +48,7 @@ static bool canBeHoisted(Operation *op,
 }
 
 size_t mlir::moveLoopInvariantCode(
-    RegionRange regions,
+    ArrayRef<Region *> regions,
     function_ref<bool(Value, Region *)> isDefinedOutsideRegion,
     function_ref<bool(Operation *, Region *)> shouldMoveOutOfRegion,
    function_ref<void(Operation *)> moveOutOfRegion) {
@@ -96,7 +96,7 @@ size_t mlir::moveLoopInvariantCode(
 
 size_t mlir::moveLoopInvariantCode(LoopLikeOpInterface loopLike) {
   return moveLoopInvariantCode(
-      &loopLike.getLoopBody(),
+      loopLike.getLoopRegions(),
       [&](Value value, Region *) {
         return loopLike.isDefinedOutsideOfLoop(value);
       },
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
index 23f4687d0394d..e8f4440d216ee 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
@@ -43,11 +43,13 @@ __all__ = [
     "DenseElementsAttr",
     "DenseFPElementsAttr",
     "DenseIntElementsAttr",
+    "DenseResourceElementsAttr",
     "Dialect",
     "DialectDescriptor",
     "Dialects",
     "Diagnostic",
     "DiagnosticHandler",
+    "DiagnosticInfo",
     "DiagnosticSeverity",
     "DictAttr",
     "Float8E4M3FNType",
@@ -74,6 +76,7 @@ __all__ = [
     "Location",
     "MemRefType",
     "Module",
+    "MLIRError",
     "NamedAttribute",
     "NoneType",
     "OpaqueType",
@@ -123,10 +126,16 @@ class _OperationBase:
     @property
     def attributes(self) -> OpAttributeMap: ...
     @property
+    def context(self) -> Context: ...
+    @property
     def location(self) -> Location: ...
     @property
+    def name(self) -> str: ...
+    @property
     def operands(self) -> OpOperandList: ...
     @property
+    def parent(self) -> Optional[_OperationBase]: ...
+    @property
     def regions(self) -> RegionSequence: ...
     @property
     def result(self) -> OpResult: ...
@@ -530,6 +539,10 @@ class DenseIntElementsAttr(DenseElementsAttr):
     @property
     def type(self) -> Type: ...
 
+class DenseResourceElementsAttr(Attribute):
+    @staticmethod
+    def get_from_buffer(array: Any, name: str, type: Type, alignment: Optional[int] = None, is_mutable: bool = False, context: Optional[Context] = None) -> None: ...
+
 class Dialect:
     def __init__(self, descriptor: DialectDescriptor) -> None: ...
     @property
@@ -563,6 +576,17 @@ class DiagnosticHandler:
     def __enter__(self) -> DiagnosticHandler: ...
     def __exit__(self, arg0: object, arg1: object, arg2: object) -> None: ...
 
+class DiagnosticInfo:
+    def __init__(self, diag: Diagnostic) -> None: ...
+    @property
+    def severity(self) -> "DiagnosticSeverity": ...
+    @property
+    def location(self) -> "Location": ...
+    @property
+    def message(self) -> str: ...
+    @property
+    def notes(self) -> Sequence["DiagnosticInfo"]: ...
+
 class DiagnosticSeverity:
     ERROR: DiagnosticSeverity
     WARNING: DiagnosticSeverity
@@ -871,6 +895,9 @@ class Module:
     @property
     def operation(self) -> Operation: ...
 
+class MLIRError(Exception):
+    def __init__(self, message: str, error_diagnostics: List[DiagnosticInfo]) -> None: ...
+
 class NamedAttribute:
     @property
     def attr(self) -> Attribute: ...
@@ -950,9 +977,9 @@ class OpView(_OperationBase): loc: Optional[Location] = None, ip: Optional[InsertionPoint] = None) -> _TOperation: ... @property - def context(self) -> Context: ... - @property def operation(self) -> Operation: ... + @property + def opview(self) -> "OpView": ... class Operation(_OperationBase): def _CAPICreate(self) -> object: ... @@ -968,13 +995,9 @@ class Operation(_OperationBase): @property def _CAPIPtr(self) -> object: ... @property - def context(self) -> Context: ... - @property - def name(self) -> str: ... + def operation(self) -> "Operation": ... @property def opview(self) -> OpView: ... - @property - def parent(self) -> Optional[_OperationBase]: ... class OperationIterator: def __iter__(self) -> OperationIterator: ... diff --git a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi index 44d22255e3110..c072d5e0fb86f 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi @@ -20,6 +20,6 @@ class PassManager: def enable_verifier(self, enable: bool) -> None: ... @staticmethod def parse(pipeline: str, context: Optional[_ir.Context] = None) -> PassManager: ... - def run(self, module: _ir.Module) -> None: ... + def run(self, module: _ir._OperationBase) -> None: ... @property def _CAPIPtr(self) -> object: ... diff --git a/mlir/python/mlir/dialects/_structured_transform_ops_ext.py b/mlir/python/mlir/dialects/_structured_transform_ops_ext.py index c5134b6e718f3..fd3dbca7c5a60 100644 --- a/mlir/python/mlir/dialects/_structured_transform_ops_ext.py +++ b/mlir/python/mlir/dialects/_structured_transform_ops_ext.py @@ -366,7 +366,7 @@ class MaskedVectorizeOp: def __init__( self, target: Union[Operation, OpView, Value], - vector_sizes: Union[DynamicIndexList, ArrayAttr], + vector_sizes: Optional[Union[DynamicIndexList, ArrayAttr]] = None, *, vectorize_nd_extract: Optional[bool] = None, scalable_sizes: OptionalBoolList = None, @@ -374,7 +374,13 @@ def __init__( loc=None, ip=None, ): - if scalable_sizes is None and static_vector_sizes is None: + if ( + scalable_sizes is None + and static_vector_sizes is None + and vector_sizes is None + ): + dynamic_vector_sizes = [] + elif scalable_sizes is None and static_vector_sizes is None: ( dynamic_vector_sizes, static_vector_sizes, diff --git a/mlir/test/CAPI/ir.c b/mlir/test/CAPI/ir.c index 5725d05a3e132..c031e61945d03 100644 --- a/mlir/test/CAPI/ir.c +++ b/mlir/test/CAPI/ir.c @@ -487,6 +487,13 @@ static void printFirstOfEach(MlirContext ctx, MlirOperation operation) { // CHECK: Op print with all flags: %{{.*}} = "arith.constant"() <{value = 0 : index}> {elts = dense_resource<__elided__> : tensor<4xi32>} : () -> index loc(unknown) // clang-format on + MlirAsmState state = mlirAsmStateCreateForOperation(parentOperation, flags); + fprintf(stderr, "With state: |"); + mlirValuePrintAsOperand(value, state, printToStderr, NULL); + // CHECK: With state: |%0| + fprintf(stderr, "|\n"); + mlirAsmStateDestroy(state); + mlirOpPrintingFlagsDestroy(flags); } diff --git a/mlir/test/Conversion/GPUToSPIRV/wmma-ops-to-spirv-khr-coop-matrix.mlir b/mlir/test/Conversion/GPUToSPIRV/wmma-ops-to-spirv-khr-coop-matrix.mlir index 0818791b98471..f129cc8ce84ec 100644 --- a/mlir/test/Conversion/GPUToSPIRV/wmma-ops-to-spirv-khr-coop-matrix.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/wmma-ops-to-spirv-khr-coop-matrix.mlir @@ -69,12 +69,106 @@ module attributes { -> !gpu.mma_matrix<16x16xf16, "COp"> %i = arith.constant 0 : index - // CHECK: 
spirv.KHR.CooperativeMatrixStore {{%.+}}, %[[MAD]], %{{.+}}, + // CHECK: spirv.KHR.CooperativeMatrixStore %{{.+}}, %[[MAD]], %{{.+}}, gpu.subgroup_mma_store_matrix %D, %ptr[%i,%i] {leadDimension = 32 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16, #spirv.storage_class> // CHECK: spirv.Return gpu.return } + // CHECK-LABEL: spirv.func @gpu_wmma_constant_op + gpu.func @gpu_wmma_constant_op(%ptr: memref<16x16xf16, #spirv.storage_class>) kernel + attributes {spirv.entry_point_abi = #spirv.entry_point_abi} { + // CHECK: %[[CST1F:.+]] = spirv.Constant 1.000000e+00 : f16 + %cst = arith.constant 1.0 : f16 + // CHECK: %[[MAT:.+]] = spirv.CompositeConstruct %[[CST1F]] : + // CHECK-SAME: (f16) -> !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + %C = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf16, "COp"> + + %i = arith.constant 0 : index + // CHECK: spirv.KHR.CooperativeMatrixStore %{{.+}}, %[[MAT]], %{{.+}}, + gpu.subgroup_mma_store_matrix %C, %ptr[%i,%i] {leadDimension = 32 : index} : + !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16, #spirv.storage_class> + // CHECK: spirv.Return + gpu.return + } + + // CHECK-LABEL: spirv.func @gpu_wmma_elementwise_op_default + // CHECK-SAME: !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + // CHECK-SAME: !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + gpu.func @gpu_wmma_elementwise_op_default(%A: !gpu.mma_matrix<16x16xf16, "COp">, + %B: !gpu.mma_matrix<16x16xf16, "COp">, + %ptr: memref<16x16xf32, #spirv.storage_class>) kernel + attributes {spirv.entry_point_abi = #spirv.entry_point_abi} { + // CHECK: {{%.*}} = spirv.FAdd {{%.*}}, {{%.*}} : !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + %C = gpu.subgroup_mma_elementwise addf %A, %B : + (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> + // CHECK: {{%.*}} = spirv.FNegate {{%.*}} : !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + %D = gpu.subgroup_mma_elementwise negatef %C : + (!gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> + // CHECK: {{%.*}} = spirv.FDiv {{%.*}}, {{%.*}} : !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + %E = gpu.subgroup_mma_elementwise divf %D, %A : + (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> + // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : + // CHECK-SAME: !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> to !spirv.coopmatrix<16x16xf32, Subgroup, MatrixAcc> + %F = gpu.subgroup_mma_elementwise extf %E : + (!gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf32, "COp"> + + %i = arith.constant 0 : index + // CHECK: spirv.KHR.CooperativeMatrixStore %{{.+}}, %{{.+}}, %{{.+}}, + gpu.subgroup_mma_store_matrix %F, %ptr[%i,%i] {leadDimension = 32 : index} : + !gpu.mma_matrix<16x16xf32, "COp">, memref<16x16xf32, #spirv.storage_class> + // CHECK: spirv.Return + gpu.return + } + + // CHECK-LABEL: spirv.func @gpu_wmma_elementwise_op_matrix_times_scalar + // CHECK-SAME: %[[A:.+]]: !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + // CHECK-SAME: %[[S:.+]]: f16 + gpu.func @gpu_wmma_elementwise_op_matrix_times_scalar( + %A: !gpu.mma_matrix<16x16xf16, "COp">, %scalar: f16, + %ptr: memref<16x16xf16, #spirv.storage_class>) kernel + attributes {spirv.entry_point_abi = #spirv.entry_point_abi} { + %i = arith.constant 0 : index + + %B = gpu.subgroup_mma_constant_matrix %scalar : !gpu.mma_matrix<16x16xf16, "COp"> + // CHECK: %[[C:.+]] = spirv.MatrixTimesScalar %[[A]], %[[S]] : !spirv.coopmatrix<16x16xf16, 
Subgroup, MatrixAcc>, f16 + // CHECK: spirv.KHR.CooperativeMatrixStore %{{.+}}, %[[C]], %{{.+}}, + %C = gpu.subgroup_mma_elementwise mulf %A, %B : + (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> + gpu.subgroup_mma_store_matrix %C, %ptr[%i,%i] {leadDimension = 32 : index} : + !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16, #spirv.storage_class> + + // CHECK: %[[D:.+]] = spirv.MatrixTimesScalar %[[C]], %[[S]] : !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc>, f16 + // CHECK: spirv.KHR.CooperativeMatrixStore %{{.+}}, %[[D]], %{{.+}}, + %D = gpu.subgroup_mma_elementwise mulf %B, %C : + (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> + gpu.subgroup_mma_store_matrix %D, %ptr[%i,%i] {leadDimension = 32 : index} : + !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16, #spirv.storage_class> + // CHECK: spirv.Return + gpu.return + } + + // CHECK-LABEL: spirv.func @gpu_wmma_elementwise_op_matrix_plus_scalar + // CHECK-SAME: %[[A:.+]]: !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + // CHECK-SAME: %[[S:.+]]: f16 + gpu.func @gpu_wmma_elementwise_op_matrix_plus_scalar( + %A : !gpu.mma_matrix<16x16xf16, "COp">, %scalar : f16, + %ptr: memref<16x16xf16, #spirv.storage_class>) kernel + attributes {spirv.entry_point_abi = #spirv.entry_point_abi} { + %i = arith.constant 0 : index + + // CHECK: %[[SM:.+]] = spirv.CompositeConstruct %[[S]] : (f16) -> !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + %B = gpu.subgroup_mma_constant_matrix %scalar : !gpu.mma_matrix<16x16xf16, "COp"> + // CHECK: %[[C:.+]] = spirv.FAdd %[[A]], %[[SM]] : !spirv.coopmatrix<16x16xf16, Subgroup, MatrixAcc> + %C = gpu.subgroup_mma_elementwise addf %A, %B : + (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp"> + + // CHECK: spirv.KHR.CooperativeMatrixStore %{{.+}}, %[[C]], %{{.+}}, + gpu.subgroup_mma_store_matrix %C, %ptr[%i,%i] {leadDimension = 32 : index} : + !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16, #spirv.storage_class> + // CHECK: spirv.Return + gpu.return + } } } diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index fedbcd401d44c..1df27dd9957e5 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -193,13 +193,26 @@ func.func @task_depend(%arg0: !llvm.ptr) { // CHECK-LABEL: @_QPomp_target_data // CHECK: (%[[ARG0:.*]]: !llvm.ptr, %[[ARG1:.*]]: !llvm.ptr, %[[ARG2:.*]]: !llvm.ptr, %[[ARG3:.*]]: !llvm.ptr) -// CHECK: omp.target_enter_data map((to -> %[[ARG0]] : !llvm.ptr), (to -> %[[ARG1]] : !llvm.ptr), (always, alloc -> %[[ARG2]] : !llvm.ptr)) -// CHECK: omp.target_exit_data map((from -> %[[ARG0]] : !llvm.ptr), (from -> %[[ARG1]] : !llvm.ptr), (release -> %[[ARG2]] : !llvm.ptr), (always, delete -> %[[ARG3]] : !llvm.ptr)) -// CHECK: llvm.return +// CHECK: %[[MAP0:.*]] = omp.map_info var_ptr(%[[ARG0]] : !llvm.ptr) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: %[[MAP1:.*]] = omp.map_info var_ptr(%[[ARG1]] : !llvm.ptr) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: %[[MAP2:.*]] = omp.map_info var_ptr(%[[ARG2]] : !llvm.ptr) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: omp.target_enter_data map_entries(%[[MAP0]], %[[MAP1]], %[[MAP2]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) +// CHECK: 
%[[MAP3:.*]] = omp.map_info var_ptr(%[[ARG0]] : !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: %[[MAP4:.*]] = omp.map_info var_ptr(%[[ARG1]] : !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: %[[MAP5:.*]] = omp.map_info var_ptr(%[[ARG2]] : !llvm.ptr) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: %[[MAP6:.*]] = omp.map_info var_ptr(%[[ARG3]] : !llvm.ptr) map_clauses(always, delete) capture(ByRef) -> !llvm.ptr {name = ""} +// CHECK: omp.target_exit_data map_entries(%[[MAP3]], %[[MAP4]], %[[MAP5]], %[[MAP6]] : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) llvm.func @_QPomp_target_data(%a : !llvm.ptr, %b : !llvm.ptr, %c : !llvm.ptr, %d : !llvm.ptr) { - omp.target_enter_data map((to -> %a : !llvm.ptr), (to -> %b : !llvm.ptr), (always, alloc -> %c : !llvm.ptr)) - omp.target_exit_data map((from -> %a : !llvm.ptr), (from -> %b : !llvm.ptr), (release -> %c : !llvm.ptr), (always, delete -> %d : !llvm.ptr)) + %0 = omp.map_info var_ptr(%a : !llvm.ptr) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} + %1 = omp.map_info var_ptr(%b : !llvm.ptr) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} + %2 = omp.map_info var_ptr(%c : !llvm.ptr) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_enter_data map_entries(%0, %1, %2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {} + %3 = omp.map_info var_ptr(%a : !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + %4 = omp.map_info var_ptr(%b : !llvm.ptr) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + %5 = omp.map_info var_ptr(%c : !llvm.ptr) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr {name = ""} + %6 = omp.map_info var_ptr(%d : !llvm.ptr) map_clauses(always, delete) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_exit_data map_entries(%3, %4, %5, %6 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {} llvm.return } @@ -207,7 +220,8 @@ llvm.func @_QPomp_target_data(%a : !llvm.ptr, %b : !llvm.ptr, %c : !ll // CHECK-LABEL: @_QPomp_target_data_region // CHECK: (%[[ARG0:.*]]: !llvm.ptr>, %[[ARG1:.*]]: !llvm.ptr) { -// CHECK: omp.target_data map((tofrom -> %[[ARG0]] : !llvm.ptr>)) { +// CHECK: %[[MAP_0:.*]] = omp.map_info var_ptr(%[[ARG0]] : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr> {name = ""} +// CHECK: omp.target_data map_entries(%[[MAP_0]] : !llvm.ptr>) { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(10 : i32) : i32 // CHECK: llvm.store %[[VAL_1]], %[[ARG1]] : !llvm.ptr // CHECK: omp.terminator @@ -215,9 +229,10 @@ llvm.func @_QPomp_target_data(%a : !llvm.ptr, %b : !llvm.ptr, %c : !ll // CHECK: llvm.return llvm.func @_QPomp_target_data_region(%a : !llvm.ptr>, %i : !llvm.ptr) { - omp.target_data map((tofrom -> %a : !llvm.ptr>)) { - %1 = llvm.mlir.constant(10 : i32) : i32 - llvm.store %1, %i : !llvm.ptr + %1 = omp.map_info var_ptr(%a : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr> {name = ""} + omp.target_data map_entries(%1 : !llvm.ptr>) { + %2 = llvm.mlir.constant(10 : i32) : i32 + llvm.store %2, %i : !llvm.ptr omp.terminator } llvm.return @@ -229,7 +244,8 @@ llvm.func @_QPomp_target_data_region(%a : !llvm.ptr>, %i : !ll // CHECK: %[[ARG_0:.*]]: !llvm.ptr>, // CHECK: %[[ARG_1:.*]]: !llvm.ptr) { // CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(64 : i32) : i32 -// CHECK: omp.target thread_limit(%[[VAL_0]] : i32) map((tofrom -> %[[ARG_0]] : !llvm.ptr>)) { +// CHECK: %[[MAP:.*]] = omp.map_info var_ptr(%[[ARG_0]] : !llvm.ptr>) 
map_clauses(tofrom) capture(ByRef) -> !llvm.ptr> {name = ""} +// CHECK: omp.target thread_limit(%[[VAL_0]] : i32) map_entries(%[[MAP]] : !llvm.ptr>) { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(10 : i32) : i32 // CHECK: llvm.store %[[VAL_1]], %[[ARG_1]] : !llvm.ptr // CHECK: omp.terminator @@ -239,9 +255,10 @@ llvm.func @_QPomp_target_data_region(%a : !llvm.ptr>, %i : !ll llvm.func @_QPomp_target(%a : !llvm.ptr>, %i : !llvm.ptr) { %0 = llvm.mlir.constant(64 : i32) : i32 - omp.target thread_limit(%0 : i32) map((tofrom -> %a : !llvm.ptr>)) { - %1 = llvm.mlir.constant(10 : i32) : i32 - llvm.store %1, %i : !llvm.ptr + %1 = omp.map_info var_ptr(%a : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr> {name = ""} + omp.target thread_limit(%0 : i32) map_entries(%1 : !llvm.ptr>) { + %2 = llvm.mlir.constant(10 : i32) : i32 + llvm.store %2, %i : !llvm.ptr omp.terminator } llvm.return @@ -415,3 +432,46 @@ llvm.func @sub_() { } llvm.return } + +// ----- + +// CHECK-LABEL: llvm.func @_QPtarget_map_with_bounds( +// CHECK: %[[ARG_0:.*]]: !llvm.ptr, +// CHECK: %[[ARG_1:.*]]: !llvm.ptr>, +// CHECK: %[[ARG_2:.*]]: !llvm.ptr>) { +// CHECK: %[[C_01:.*]] = llvm.mlir.constant(4 : index) : i64 +// CHECK: %[[C_02:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[C_03:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[C_04:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[BOUNDS0:.*]] = omp.bounds lower_bound(%[[C_02]] : i64) upper_bound(%[[C_01]] : i64) stride(%[[C_04]] : i64) start_idx(%[[C_04]] : i64) +// CHECK: %[[MAP0:.*]] = omp.map_info var_ptr(%[[ARG_1]] : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !llvm.ptr> {name = ""} +// CHECK: %[[C_11:.*]] = llvm.mlir.constant(4 : index) : i64 +// CHECK: %[[C_12:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[C_13:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[C_14:.*]] = llvm.mlir.constant(1 : index) : i64 +// CHECK: %[[BOUNDS1:.*]] = omp.bounds lower_bound(%[[C_12]] : i64) upper_bound(%[[C_11]] : i64) stride(%[[C_14]] : i64) start_idx(%[[C_14]] : i64) +// CHECK: %[[MAP1:.*]] = omp.map_info var_ptr(%[[ARG_2]] : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !llvm.ptr> {name = ""} +// CHECK: omp.target map_entries(%[[MAP0]], %[[MAP1]] : !llvm.ptr>, !llvm.ptr>) { +// CHECK: omp.terminator +// CHECK: } +// CHECK: llvm.return +// CHECK:} + +llvm.func @_QPtarget_map_with_bounds(%arg0: !llvm.ptr, %arg1: !llvm.ptr>, %arg2: !llvm.ptr>) { + %0 = llvm.mlir.constant(4 : index) : i64 + %1 = llvm.mlir.constant(1 : index) : i64 + %2 = llvm.mlir.constant(1 : index) : i64 + %3 = llvm.mlir.constant(1 : index) : i64 + %4 = omp.bounds lower_bound(%1 : i64) upper_bound(%0 : i64) stride(%3 : i64) start_idx(%3 : i64) + %5 = omp.map_info var_ptr(%arg1 : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr> {name = ""} + %6 = llvm.mlir.constant(4 : index) : i64 + %7 = llvm.mlir.constant(1 : index) : i64 + %8 = llvm.mlir.constant(1 : index) : i64 + %9 = llvm.mlir.constant(1 : index) : i64 + %10 = omp.bounds lower_bound(%7 : i64) upper_bound(%6 : i64) stride(%9 : i64) start_idx(%9 : i64) + %11 = omp.map_info var_ptr(%arg2 : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) bounds(%10) -> !llvm.ptr> {name = ""} + omp.target map_entries(%5, %11 : !llvm.ptr>, !llvm.ptr>) { + omp.terminator + } + llvm.return +} \ No newline at end of file diff --git a/mlir/test/Dialect/Linalg/transform-op-tile.mlir b/mlir/test/Dialect/Linalg/transform-op-tile.mlir index ce2a3d6ca9c58..9df19632506a7 
100644 --- a/mlir/test/Dialect/Linalg/transform-op-tile.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-tile.mlir @@ -190,16 +190,16 @@ transform.sequence failures(propagate) { // ----- // CHECK-LABEL: func.func @scalable_and_fixed_length_tile -// CHECK: %[[STEP_0:.*]] = arith.constant 4 : index -// CHECK: %[[STEP_1:.*]] = arith.constant 4 : index // CHECK: %[[C4:.*]] = arith.constant 4 : index // CHECK: %[[VS:.*]] = vector.vscale // CHECK: %[[STEP_2:.*]] = arith.muli %[[C4]], %[[VS]] : index // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[STEP_0:.*]] = arith.constant 4 : index // CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C128]] step %[[STEP_0]] // CHECK: %[[C0_1:.*]] = arith.constant 0 : index // CHECK: %[[C128_1:.*]] = arith.constant 128 : index +// CHECK: %[[STEP_1:.*]] = arith.constant 4 : index // CHECK: scf.for %[[VAL_16:.*]] = %[[C0_1]] to %[[C128_1]] step %[[STEP_1]] // CHECK: %[[C0_2:.*]] = arith.constant 0 : index // CHECK: %[[C128_2:.*]] = arith.constant 128 : index diff --git a/mlir/test/Dialect/MLProgram/pipeline-globals.mlir b/mlir/test/Dialect/MLProgram/pipeline-globals.mlir new file mode 100644 index 0000000000000..a5c9b3e890558 --- /dev/null +++ b/mlir/test/Dialect/MLProgram/pipeline-globals.mlir @@ -0,0 +1,246 @@ +// RUN: mlir-opt -split-input-file -pass-pipeline="builtin.module(mlprogram-pipeline-globals)" --allow-unregistered-dialect %s + +// CHECK-LABEL: @global_variable +ml_program.global private mutable @global_variable(dense<4> : tensor<4xi32>) : tensor<4xi32> + +// CHECK-LABEL: @global_double_load +func.func @global_double_load() { + // CHECK: %[[LOAD:.+]] = ml_program.global_load @global_variable + // CHECK-NOT: ml_program.global_load @global_variable + %0 = ml_program.global_load @global_variable : tensor<4xi32> + %1 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]], %[[LOAD]]) + %2 = "unregistered.dummy"(%0, %1) : (tensor<4xi32>, tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %2 : tensor<4xi32> + func.return +} + +// ----- + +// CHECK-LABEL: @global_variable +ml_program.global private mutable @global_variable(dense<4> : tensor<4xi32>) : tensor<4xi32> + +// CHECK-LABEL: @global_double_store +func.func @global_double_store() { + // CHECK: %[[LOAD:.+]] = ml_program.global_load @global_variable + %0 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %1 = "unregistered.dummy"(%0) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %1 : tensor<4xi32> + + // CHECK-NOT: ml_program.global_store + ml_program.global_store @global_variable = %1 : tensor<4xi32> + func.return +} + +// ----- + +// CHECK-LABEL: @global_variable +ml_program.global private mutable @global_variable(dense<4> : tensor<4xi32>) : tensor<4xi32> + +// CHECK-LABEL: @global_store_load +func.func @global_store_load() { + // CHECK: %[[LOAD:.+]] = ml_program.global_load @global_variable + %0 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + // CHECK: %[[DUMMY2:.+]] = "unregistered.dummy"(%[[DUMMY2]]) + %1 = "unregistered.dummy"(%0) : (tensor<4xi32>) -> (tensor<4xi32>) + ml_program.global_store @global_variable = %1 : tensor<4xi32> + %2 = 
ml_program.global_load @global_variable : tensor<4xi32> + %3 = "unregistered.dummy"(%2) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY2]] + ml_program.global_store @global_variable = %3 : tensor<4xi32> + func.return +} + +// ----- + +// CHECK-LABEL: @global_variable +ml_program.global private mutable @global_variable(dense<4> : tensor<4xi32>) : tensor<4xi32> + +// CHECK-LABEL: @global_store_load_region +func.func @global_store_load_region() { + // CHECK: %[[LOAD:.+]] = ml_program.global_load @global_variable + %0 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %1 = "unregistered.dummy"(%0) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %1 : tensor<4xi32> + + // CHECK: "unregistered.dummy2" + "unregistered.dummy2"() ({ + ^bb(): + %cst = arith.constant dense<0> : tensor<4xi32> + // CHECK: ml_program.global_store @global_variable + ml_program.global_store @global_variable = %cst : tensor<4xi32> + "unregistered.terminator"() : () -> () + }) : () -> () + + // CHECK: %[[LOAD:.+]] ml_program.global_load @global_variable + %2 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY2:.+]] = "unregistered.dummy"(%[[LOAD]]) + %3 = "unregistered.dummy"(%2) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY2]] + ml_program.global_store @global_variable = %3 : tensor<4xi32> + func.return +} + +// ----- + +// CHECK-LABEL: @global_variable +ml_program.global private mutable @global_variable(dense<4> : tensor<4xi32>) : tensor<4xi32> + +// CHECK-LABEL: @interrupt +func.func @interrupt() { + %cst = arith.constant dense<0> : tensor<4xi32> + // CHECK: ml_program.global_store + ml_program.global_store @global_variable = %cst : tensor<4xi32> + func.return +} + +// CHECK-LABEL: @call_global_store +func.func @call_global_store() { + // CHECK: %[[LOAD:.+]] = ml_program.global_load @global_variable + %0 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %1 = "unregistered.dummy"(%0) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %1 : tensor<4xi32> + call @interrupt() : () -> () + + // CHECK: %[[LOAD:.+]] ml_program.global_load @global_variable + %2 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %3 = "unregistered.dummy"(%2) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %3 : tensor<4xi32> + func.return +} + + +// ----- + +// CHECK-LABEL: @global_variable +ml_program.global private mutable @global_variable(dense<4> : tensor<4xi32>) : tensor<4xi32> + +// CHECK-LABEL: @interrupt_indirect +func.func @interrupt_indirect() { + %cst = arith.constant dense<0> : tensor<4xi32> + // CHECK: ml_program.global_store + ml_program.global_store @global_variable = %cst : tensor<4xi32> + func.return +} + +// CHECK-LABEL: @interrupt +func.func @interrupt() { + call @interrupt_indirect() : () -> () + func.return +} + +// CHECK-LABEL: @call_indirect_store +func.func @call_indirect_store() { + // CHECK: %[[LOAD:.+]] = ml_program.global_load @global_variable + %0 = ml_program.global_load 
@global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %1 = "unregistered.dummy"(%0) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %1 : tensor<4xi32> + call @interrupt() : () -> () + + // CHECK: %[[LOAD:.+]] ml_program.global_load @global_variable + %2 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %3 = "unregistered.dummy"(%2) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %3 : tensor<4xi32> + func.return +} + + +// ----- + +// CHECK-LABEL: @global_variable +ml_program.global private mutable @global_variable(dense<4> : tensor<4xi32>) : tensor<4xi32> + +// CHECK-LABEL: @interrupt_indirect +func.func @interrupt_indirect() -> tensor<4xi32> { + // CHECK: ml_program.global_load + %0 = ml_program.global_load @global_variable : tensor<4xi32> + func.return %0 : tensor<4xi32> +} + +// CHECK-LABEL: @interrupt +func.func @interrupt() { + %0 = call @interrupt_indirect() : () -> (tensor<4xi32>) + "unregistered.dummy"(%0) : (tensor<4xi32>) -> () + func.return +} + +// CHECK-LABEL: @call_indirect_load +func.func @call_indirect_load() { + // CHECK: %[[LOAD:.+]] = ml_program.global_load @global_variable + %0 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %1 = "unregistered.dummy"(%0) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %1 : tensor<4xi32> + call @interrupt() : () -> () + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %2 = ml_program.global_load @global_variable : tensor<4xi32> + %3 = "unregistered.dummy"(%2) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %3 : tensor<4xi32> + func.return +} + +// ----- + +// CHECK-LABEL: @global_variable +ml_program.global private mutable @global_variable(dense<4> : tensor<4xi32>) : tensor<4xi32> + +// CHECK-LABEL: @call_recursive +func.func @call_recursive() { + // CHECK: %[[LOAD:.+]] = ml_program.global_load @global_variable + %0 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %1 = "unregistered.dummy"(%0) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %1 : tensor<4xi32> + call @call_recursive() : () -> () + + // CHECK: %[[LOAD:.+]] ml_program.global_load @global_variable + %2 = ml_program.global_load @global_variable : tensor<4xi32> + + // CHECK: %[[DUMMY:.+]] = "unregistered.dummy"(%[[LOAD]]) + %3 = "unregistered.dummy"(%2) : (tensor<4xi32>) -> (tensor<4xi32>) + + // CHECK: ml_program.global_store @global_variable %[[DUMMY]] + ml_program.global_store @global_variable = %3 : tensor<4xi32> + func.return +} diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index a3552a781669f..c8025249e2700 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -1615,16 +1615,18 @@ func.func @omp_threadprivate() { // ----- func.func @omp_target(%map1: memref) { + %mapv = omp.map_info var_ptr(%map1 : memref) 
map_clauses(delete) capture(ByRef) -> memref {name = ""} // expected-error @below {{to, from, tofrom and alloc map types are permitted}} - omp.target map((delete -> %map1 : memref)){} + omp.target map_entries(%mapv : memref){} return } // ----- func.func @omp_target_data(%map1: memref) { + %mapv = omp.map_info var_ptr(%map1 : memref) map_clauses(delete) capture(ByRef) -> memref {name = ""} // expected-error @below {{to, from, tofrom and alloc map types are permitted}} - omp.target_data map((delete -> %map1 : memref)){} + omp.target_data map_entries(%mapv : memref){} return } @@ -1639,16 +1641,18 @@ func.func @omp_target_data() { // ----- func.func @omp_target_enter_data(%map1: memref) { + %mapv = omp.map_info var_ptr(%map1 : memref) map_clauses(from) capture(ByRef) -> memref {name = ""} // expected-error @below {{to and alloc map types are permitted}} - omp.target_enter_data map((from -> %map1 : memref)){} + omp.target_enter_data map_entries(%mapv : memref){} return } // ----- func.func @omp_target_exit_data(%map1: memref) { + %mapv = omp.map_info var_ptr(%map1 : memref) map_clauses(to) capture(ByRef) -> memref {name = ""} // expected-error @below {{from, release and delete map types are permitted}} - omp.target_exit_data map((to -> %map1 : memref)){} + omp.target_exit_data map_entries(%mapv : memref){} return } diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index be59defd27d03..13cbea6c9923c 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -490,12 +490,18 @@ func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %map1: }) {nowait, operandSegmentSizes = array} : ( i1, si32, i32 ) -> () // Test with optional map clause. - // CHECK: omp.target map((tofrom -> %{{.*}} : memref), (alloc -> %{{.*}} : memref)) { - omp.target map((tofrom -> %map1 : memref), (alloc -> %map2 : memref)){} - - // CHECK: omp.target map((to -> %{{.*}} : memref), (always, from -> %{{.*}} : memref)) { - omp.target map((to -> %map1 : memref), (always, from -> %map2 : memref)){} - + // CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[VAL_1:.*]] : memref) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} + // CHECK: %[[MAP_B:.*]] = omp.map_info var_ptr(%[[VAL_2:.*]] : memref) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + // CHECK: omp.target map_entries(%[[MAP_A]], %[[MAP_B]] : memref, memref) { + %mapv1 = omp.map_info var_ptr(%map1 : memref) map_clauses(tofrom) capture(ByRef) -> memref {name = ""} + %mapv2 = omp.map_info var_ptr(%map2 : memref) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""} + omp.target map_entries(%mapv1, %mapv2 : memref, memref){} + // CHECK: %[[MAP_C:.*]] = omp.map_info var_ptr(%[[VAL_1:.*]] : memref) map_clauses(to) capture(ByRef) -> memref {name = ""} + // CHECK: %[[MAP_D:.*]] = omp.map_info var_ptr(%[[VAL_2:.*]] : memref) map_clauses(always, from) capture(ByRef) -> memref {name = ""} + // CHECK: omp.target map_entries(%[[MAP_C]], %[[MAP_D]] : memref, memref) { + %mapv3 = omp.map_info var_ptr(%map1 : memref) map_clauses(to) capture(ByRef) -> memref {name = ""} + %mapv4 = omp.map_info var_ptr(%map2 : memref) map_clauses(always, from) capture(ByRef) -> memref {name = ""} + omp.target map_entries(%mapv3, %mapv4 : memref, memref) {} // CHECK: omp.barrier omp.barrier @@ -504,20 +510,32 @@ func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %map1: // CHECK-LABEL: omp_target_data func.func @omp_target_data (%if_cond : i1, 
%device : si32, %device_ptr: memref, %device_addr: memref, %map1: memref, %map2: memref) -> () {
- // CHECK: omp.target_data if(%[[VAL_0:.*]] : i1) device(%[[VAL_1:.*]] : si32) map((always, from -> %[[VAL_2:.*]] : memref))
- omp.target_data if(%if_cond : i1) device(%device : si32) map((always, from -> %map1 : memref)){}
-
- // CHECK: omp.target_data map((close, present, to -> %[[VAL_2:.*]] : memref)) use_device_ptr(%[[VAL_3:.*]] : memref) use_device_addr(%[[VAL_4:.*]] : memref)
- omp.target_data map((close, present, to -> %map1 : memref)) use_device_ptr(%device_ptr : memref) use_device_addr(%device_addr : memref) {}
-
- // CHECK: omp.target_data map((tofrom -> %[[VAL_2]] : memref), (alloc -> %[[VAL_5:.*]] : memref))
- omp.target_data map((tofrom -> %map1 : memref), (alloc -> %map2 : memref)){}
-
- // CHECK: omp.target_enter_data if(%[[VAL_0]] : i1) device(%[[VAL_1]] : si32) nowait map((alloc -> %[[VAL_2]] : memref))
- omp.target_enter_data if(%if_cond : i1) device(%device : si32) nowait map((alloc -> %map1 : memref))
-
- // CHECK: omp.target_exit_data if(%[[VAL_0]] : i1) device(%[[VAL_1]] : si32) nowait map((release -> %[[VAL_5]] : memref))
- omp.target_exit_data if(%if_cond : i1) device(%device : si32) nowait map((release -> %map2 : memref))
+ // CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[VAL_2:.*]] : memref) map_clauses(always, from) capture(ByRef) -> memref {name = ""}
+ // CHECK: omp.target_data if(%[[VAL_0:.*]] : i1) device(%[[VAL_1:.*]] : si32) map_entries(%[[MAP_A]] : memref)
+ %mapv1 = omp.map_info var_ptr(%map1 : memref) map_clauses(always, from) capture(ByRef) -> memref {name = ""}
+ omp.target_data if(%if_cond : i1) device(%device : si32) map_entries(%mapv1 : memref){}
+
+ // CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[VAL_2:.*]] : memref) map_clauses(close, present, to) capture(ByRef) -> memref {name = ""}
+ // CHECK: omp.target_data map_entries(%[[MAP_A]] : memref) use_device_ptr(%[[VAL_3:.*]] : memref) use_device_addr(%[[VAL_4:.*]] : memref)
+ %mapv2 = omp.map_info var_ptr(%map1 : memref) map_clauses(close, present, to) capture(ByRef) -> memref {name = ""}
+ omp.target_data map_entries(%mapv2 : memref) use_device_ptr(%device_ptr : memref) use_device_addr(%device_addr : memref) {}
+
+ // CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[VAL_1:.*]] : memref) map_clauses(tofrom) capture(ByRef) -> memref {name = ""}
+ // CHECK: %[[MAP_B:.*]] = omp.map_info var_ptr(%[[VAL_2:.*]] : memref) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""}
+ // CHECK: omp.target_data map_entries(%[[MAP_A]], %[[MAP_B]] : memref, memref)
+ %mapv3 = omp.map_info var_ptr(%map1 : memref) map_clauses(tofrom) capture(ByRef) -> memref {name = ""}
+ %mapv4 = omp.map_info var_ptr(%map2 : memref) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""}
+ omp.target_data map_entries(%mapv3, %mapv4 : memref, memref) {}
+
+ // CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[VAL_3:.*]] : memref) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""}
+ // CHECK: omp.target_enter_data if(%[[VAL_0:.*]] : i1) device(%[[VAL_1:.*]] : si32) nowait map_entries(%[[MAP_A]] : memref)
+ %mapv5 = omp.map_info var_ptr(%map1 : memref) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""}
+ omp.target_enter_data if(%if_cond : i1) device(%device : si32) nowait map_entries(%mapv5 : memref)
+
+ // CHECK: %[[MAP_A:.*]] = omp.map_info var_ptr(%[[VAL_3:.*]] : memref) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""}
+ // CHECK: omp.target_exit_data if(%[[VAL_0:.*]] : i1) device(%[[VAL_1:.*]] : si32) nowait map_entries(%[[MAP_A]] : memref)
+ %mapv6 = omp.map_info var_ptr(%map2 : memref) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref {name = ""}
+ omp.target_exit_data if(%if_cond : i1) device(%device : si32) nowait map_entries(%mapv6 : memref)
 return
}
@@ -2007,3 +2025,51 @@ atomic {
 llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
 omp.yield
}
+
+// CHECK-LABEL: omp_targets_with_map_bounds
+// CHECK-SAME: (%[[ARG0:.*]]: !llvm.ptr>, %[[ARG1:.*]]: !llvm.ptr>)
+func.func @omp_targets_with_map_bounds(%arg0: !llvm.ptr>, %arg1: !llvm.ptr>) -> () {
+ // CHECK: %[[C_00:.*]] = llvm.mlir.constant(4 : index) : i64
+ // CHECK: %[[C_01:.*]] = llvm.mlir.constant(1 : index) : i64
+ // CHECK: %[[C_02:.*]] = llvm.mlir.constant(1 : index) : i64
+ // CHECK: %[[C_03:.*]] = llvm.mlir.constant(1 : index) : i64
+ // CHECK: %[[BOUNDS0:.*]] = omp.bounds lower_bound(%[[C_01]] : i64) upper_bound(%[[C_00]] : i64) stride(%[[C_02]] : i64) start_idx(%[[C_03]] : i64)
+ // CHECK: %[[MAP0:.*]] = omp.map_info var_ptr(%[[ARG0]] : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !llvm.ptr> {name = ""}
+ %0 = llvm.mlir.constant(4 : index) : i64
+ %1 = llvm.mlir.constant(1 : index) : i64
+ %2 = llvm.mlir.constant(1 : index) : i64
+ %3 = llvm.mlir.constant(1 : index) : i64
+ %4 = omp.bounds lower_bound(%1 : i64) upper_bound(%0 : i64) stride(%2 : i64) start_idx(%3 : i64)
+
+ %mapv1 = omp.map_info var_ptr(%arg0 : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr> {name = ""}
+ // CHECK: %[[C_10:.*]] = llvm.mlir.constant(9 : index) : i64
+ // CHECK: %[[C_11:.*]] = llvm.mlir.constant(1 : index) : i64
+ // CHECK: %[[C_12:.*]] = llvm.mlir.constant(2 : index) : i64
+ // CHECK: %[[C_13:.*]] = llvm.mlir.constant(2 : index) : i64
+ // CHECK: %[[BOUNDS1:.*]] = omp.bounds lower_bound(%[[C_11]] : i64) upper_bound(%[[C_10]] : i64) stride(%[[C_12]] : i64) start_idx(%[[C_13]] : i64)
+ // CHECK: %[[MAP1:.*]] = omp.map_info var_ptr(%[[ARG1]] : !llvm.ptr>) map_clauses(exit_release_or_enter_alloc) capture(ByCopy) bounds(%[[BOUNDS1]]) -> !llvm.ptr> {name = ""}
+ %6 = llvm.mlir.constant(9 : index) : i64
+ %7 = llvm.mlir.constant(1 : index) : i64
+ %8 = llvm.mlir.constant(2 : index) : i64
+ %9 = llvm.mlir.constant(2 : index) : i64
+ %10 = omp.bounds lower_bound(%7 : i64) upper_bound(%6 : i64) stride(%8 : i64) start_idx(%9 : i64)
+ %mapv2 = omp.map_info var_ptr(%arg1 : !llvm.ptr>) map_clauses(exit_release_or_enter_alloc) capture(ByCopy) bounds(%10) -> !llvm.ptr> {name = ""}
+
+ // CHECK: omp.target map_entries(%[[MAP0]], %[[MAP1]] : !llvm.ptr>, !llvm.ptr>)
+ omp.target map_entries(%mapv1, %mapv2 : !llvm.ptr>, !llvm.ptr>){}
+
+ // CHECK: omp.target_data map_entries(%[[MAP0]], %[[MAP1]] : !llvm.ptr>, !llvm.ptr>)
+ omp.target_data map_entries(%mapv1, %mapv2 : !llvm.ptr>, !llvm.ptr>){}
+
+ // CHECK: %[[MAP2:.*]] = omp.map_info var_ptr(%[[ARG0]] : !llvm.ptr>) map_clauses(exit_release_or_enter_alloc) capture(VLAType) bounds(%[[BOUNDS0]]) -> !llvm.ptr> {name = ""}
+ // CHECK: omp.target_enter_data map_entries(%[[MAP2]] : !llvm.ptr>)
+ %mapv3 = omp.map_info var_ptr(%arg0 : !llvm.ptr>) map_clauses(exit_release_or_enter_alloc) capture(VLAType) bounds(%4) -> !llvm.ptr> {name = ""}
+ omp.target_enter_data map_entries(%mapv3 : !llvm.ptr>){}
+
+ // CHECK: %[[MAP3:.*]] = omp.map_info var_ptr(%[[ARG1]] : !llvm.ptr>) map_clauses(exit_release_or_enter_alloc) capture(This) bounds(%[[BOUNDS1]]) -> !llvm.ptr> {name = ""}
+ // CHECK: omp.target_exit_data map_entries(%[[MAP3]] : !llvm.ptr>)
+ %mapv4 = omp.map_info var_ptr(%arg1 : !llvm.ptr>) map_clauses(exit_release_or_enter_alloc) capture(This) bounds(%10) -> !llvm.ptr> {name = ""}
+ omp.target_exit_data map_entries(%mapv4 : !llvm.ptr>){}
+
+ return
+}
diff --git a/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir b/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir
index 170f851138f82..0036bd5c3310b 100644
--- a/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir
+++ b/mlir/test/Dialect/SparseTensor/buffer_rewriting.mlir
@@ -75,343 +75,9 @@ func.func @sparse_push_back_inbound(%arg0: index, %arg1: memref, %arg2: f
// -----
-// CHECK-LABEL: func.func private @_sparse_partition_1_i8_f32_index(
-// CHECK-SAME: %[[VAL_0:.*0]]: index,
-// CHECK-SAME: %[[VAL_1:.*1]]: index,
-// CHECK-SAME: %[[VAL_2:.*2]]: memref,
-// CHECK-SAME: %[[VAL_3:.*3]]: memref,
-// CHECK-SAME: %[[VAL_4:.*4]]: memref) -> index {
-// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1
-// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1000
-// CHECK-DAG: %[[VAL_7:.*]] = arith.constant -1
-// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]]
-// CHECK: %[[VAL_9:.*]] = arith.shrui %[[VAL_8]], %[[VAL_5]]
-// CHECK: %[[VAL_10:.*]] = arith.subi %[[VAL_1]], %[[VAL_5]]
-// CHECK: %[[VAL_11:.*]] = arith.subi %[[VAL_1]], %[[VAL_0]]
-// CHECK: %[[VAL_12:.*]] = arith.cmpi ult, %[[VAL_11]], %[[VAL_6]]
-// CHECK: scf.if %[[VAL_12]] {
-// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]]
-// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]]
-// CHECK: %[[VAL_15:.*]] = arith.cmpi ult, %[[VAL_13]], %[[VAL_14]]
-// CHECK: scf.if %[[VAL_15]] {
-// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]]
-// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]]
-// CHECK: memref.store %[[VAL_17]], %[[VAL_2]]{{\[}}%[[VAL_9]]]
-// CHECK: memref.store %[[VAL_16]], %[[VAL_2]]{{\[}}%[[VAL_0]]]
-// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]]
-// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_0]]]
-// CHECK: memref.store %[[VAL_19]], %[[VAL_3]]{{\[}}%[[VAL_9]]]
-// CHECK: memref.store %[[VAL_18]], %[[VAL_3]]{{\[}}%[[VAL_0]]]
-// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]]
-// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_0]]]
-// CHECK: memref.store %[[VAL_21]], %[[VAL_4]]{{\[}}%[[VAL_9]]]
-// CHECK: memref.store %[[VAL_20]], %[[VAL_4]]{{\[}}%[[VAL_0]]]
-// CHECK: }
-// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_10]]]
-// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]]
-// CHECK: %[[VAL_24:.*]] = arith.cmpi ult, %[[VAL_22]], %[[VAL_23]]
-// CHECK: scf.if %[[VAL_24]] {
-// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_10]]]
-// CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]]
-// CHECK: memref.store %[[VAL_26]], %[[VAL_2]]{{\[}}%[[VAL_10]]]
-// CHECK: memref.store %[[VAL_25]], %[[VAL_2]]{{\[}}%[[VAL_9]]]
-// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_10]]]
-// CHECK: %[[VAL_28:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]]
-// CHECK: memref.store %[[VAL_28]], %[[VAL_3]]{{\[}}%[[VAL_10]]]
-// CHECK: memref.store %[[VAL_27]], %[[VAL_3]]{{\[}}%[[VAL_9]]]
-// CHECK: %[[VAL_29:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_10]]]
-// CHECK: %[[VAL_30:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]]
-// CHECK: memref.store %[[VAL_30]], %[[VAL_4]]{{\[}}%[[VAL_10]]]
-// CHECK: memref.store %[[VAL_29]], %[[VAL_4]]{{\[}}%[[VAL_9]]]
-//
CHECK: %[[VAL_31:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_33:.*]] = arith.cmpi ult, %[[VAL_31]], %[[VAL_32]] -// CHECK: scf.if %[[VAL_33]] { -// CHECK: %[[VAL_34:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_35:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_35]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_34]], %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_36:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_37:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_37]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_36]], %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_39:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_39]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_38]], %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: } -// CHECK: } -// CHECK: } else { -// CHECK: %[[VAL_40:.*]] = arith.addi %[[VAL_9]], %[[VAL_1]] -// CHECK: %[[VAL_41:.*]] = arith.shrui %[[VAL_40]], %[[VAL_5]] -// CHECK: %[[VAL_42:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_43:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_44:.*]] = arith.cmpi ult, %[[VAL_42]], %[[VAL_43]] -// CHECK: scf.if %[[VAL_44]] { -// CHECK: %[[VAL_45:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_46:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_46]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_45]], %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_47:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_48:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_48]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_47]], %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_49:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_50:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_50]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_49]], %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: } -// CHECK: %[[VAL_51:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_52:.*]] = arith.cmpi ult, %[[VAL_51]], %[[VAL_51]] -// CHECK: scf.if %[[VAL_52]] { -// CHECK: %[[VAL_53:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_53]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_53]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_54:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_54]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_54]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_55:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_55]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_55]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_56:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_57:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_58:.*]] = arith.cmpi ult, %[[VAL_56]], %[[VAL_57]] -// CHECK: scf.if %[[VAL_58]] { -// CHECK: %[[VAL_59:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_60:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_60]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_59]], %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_61:.*]] = 
memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_62:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_62]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_61]], %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_63:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_64:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_64]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_63]], %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: } -// CHECK: } -// CHECK: %[[VAL_65:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_66:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_67:.*]] = arith.cmpi ult, %[[VAL_65]], %[[VAL_66]] -// CHECK: scf.if %[[VAL_67]] { -// CHECK: %[[VAL_68:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_69:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_69]], %[[VAL_2]]{{\[}}%[[VAL_41]]] -// CHECK: memref.store %[[VAL_68]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_70:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_71:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_71]], %[[VAL_3]]{{\[}}%[[VAL_41]]] -// CHECK: memref.store %[[VAL_70]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_72:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_73:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_73]], %[[VAL_4]]{{\[}}%[[VAL_41]]] -// CHECK: memref.store %[[VAL_72]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_74:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_75:.*]] = arith.cmpi ult, %[[VAL_74]], %[[VAL_74]] -// CHECK: scf.if %[[VAL_75]] { -// CHECK: %[[VAL_76:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_76]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_76]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_77:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_77]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_77]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_78:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_78]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_78]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_79:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_80:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_81:.*]] = arith.cmpi ult, %[[VAL_79]], %[[VAL_80]] -// CHECK: scf.if %[[VAL_81]] { -// CHECK: %[[VAL_82:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_83:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_83]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_82]], %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_84:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_85:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_85]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_84]], %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_86:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_87:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_87]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_86]], %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: %[[VAL_88:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_10]]] -// CHECK: %[[VAL_89:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_41]]] -// 
CHECK: %[[VAL_90:.*]] = arith.cmpi ult, %[[VAL_88]], %[[VAL_89]] -// CHECK: scf.if %[[VAL_90]] { -// CHECK: %[[VAL_91:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_10]]] -// CHECK: %[[VAL_92:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_41]]] -// CHECK: memref.store %[[VAL_92]], %[[VAL_2]]{{\[}}%[[VAL_10]]] -// CHECK: memref.store %[[VAL_91]], %[[VAL_2]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_93:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_10]]] -// CHECK: %[[VAL_94:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_41]]] -// CHECK: memref.store %[[VAL_94]], %[[VAL_3]]{{\[}}%[[VAL_10]]] -// CHECK: memref.store %[[VAL_93]], %[[VAL_3]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_95:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_10]]] -// CHECK: %[[VAL_96:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_41]]] -// CHECK: memref.store %[[VAL_96]], %[[VAL_4]]{{\[}}%[[VAL_10]]] -// CHECK: memref.store %[[VAL_95]], %[[VAL_4]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_97:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_98:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_99:.*]] = arith.cmpi ult, %[[VAL_97]], %[[VAL_98]] -// CHECK: scf.if %[[VAL_99]] { -// CHECK: %[[VAL_100:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_101:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_101]], %[[VAL_2]]{{\[}}%[[VAL_41]]] -// CHECK: memref.store %[[VAL_100]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_102:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_103:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_103]], %[[VAL_3]]{{\[}}%[[VAL_41]]] -// CHECK: memref.store %[[VAL_102]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_104:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_41]]] -// CHECK: %[[VAL_105:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_105]], %[[VAL_4]]{{\[}}%[[VAL_41]]] -// CHECK: memref.store %[[VAL_104]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_106:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_107:.*]] = arith.cmpi ult, %[[VAL_106]], %[[VAL_106]] -// CHECK: scf.if %[[VAL_107]] { -// CHECK: %[[VAL_108:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_108]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_108]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_109:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_109]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_109]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_110:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_110]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_110]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_111:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_112:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_113:.*]] = arith.cmpi ult, %[[VAL_111]], %[[VAL_112]] -// CHECK: scf.if %[[VAL_113]] { -// CHECK: %[[VAL_114:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_115:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_115]], %[[VAL_2]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_114]], %[[VAL_2]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_116:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_117:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_117]], %[[VAL_3]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_116]], %[[VAL_3]]{{\[}}%[[VAL_0]]] -// CHECK: %[[VAL_118:.*]] = 
memref.load %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: %[[VAL_119:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: memref.store %[[VAL_119]], %[[VAL_4]]{{\[}}%[[VAL_9]]] -// CHECK: memref.store %[[VAL_118]], %[[VAL_4]]{{\[}}%[[VAL_0]]] -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: %[[VAL_120:.*]]:3 = scf.while (%[[VAL_121:.*]] = %[[VAL_0]], %[[VAL_122:.*]] = %[[VAL_10]], %[[VAL_123:.*]] = %[[VAL_9]]) -// CHECK: %[[VAL_124:.*]] = arith.cmpi ult, %[[VAL_121]], %[[VAL_122]] -// CHECK: scf.condition(%[[VAL_124]]) %[[VAL_121]], %[[VAL_122]], %[[VAL_123]] -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_125:.*]]: index, %[[VAL_126:.*]]: index, %[[VAL_127:.*]]: index) -// CHECK: %[[VAL_128:.*]] = scf.while (%[[VAL_129:.*]] = %[[VAL_125]]) -// CHECK: %[[VAL_130:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_129]]] -// CHECK: %[[VAL_131:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_127]]] -// CHECK: %[[VAL_132:.*]] = arith.cmpi ult, %[[VAL_130]], %[[VAL_131]] -// CHECK: scf.condition(%[[VAL_132]]) %[[VAL_129]] -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_133:.*]]: index): -// CHECK: %[[VAL_134:.*]] = arith.addi %[[VAL_133]], %[[VAL_5]] -// CHECK: scf.yield %[[VAL_134]] -// CHECK: } -// CHECK: %[[VAL_135:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_136:.*]]] -// CHECK: %[[VAL_137:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_127]]] -// CHECK: %[[VAL_138:.*]] = arith.cmpi eq, %[[VAL_135]], %[[VAL_137]] -// CHECK: %[[VAL_139:.*]] = scf.while (%[[VAL_140:.*]] = %[[VAL_126]]) -// CHECK: %[[VAL_141:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_127]]] -// CHECK: %[[VAL_142:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_140]]] -// CHECK: %[[VAL_143:.*]] = arith.cmpi ult, %[[VAL_141]], %[[VAL_142]] -// CHECK: scf.condition(%[[VAL_143]]) %[[VAL_140]] -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_144:.*]]: index): -// CHECK: %[[VAL_145:.*]] = arith.addi %[[VAL_144]], %[[VAL_7]] -// CHECK: scf.yield %[[VAL_145]] -// CHECK: } -// CHECK: %[[VAL_146:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_147:.*]]] -// CHECK: %[[VAL_149:.*]] = arith.cmpi eq, %[[VAL_146]], %[[VAL_137]] -// CHECK: %[[VAL_150:.*]] = arith.cmpi ult, %[[VAL_136]], %[[VAL_147]] -// CHECK: %[[VAL_151:.*]]:3 = scf.if %[[VAL_150]] -// CHECK: %[[VAL_152:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_136]]] -// CHECK: %[[VAL_153:.*]] = memref.load %[[VAL_2]]{{\[}}%[[VAL_147]]] -// CHECK: memref.store %[[VAL_153]], %[[VAL_2]]{{\[}}%[[VAL_136]]] -// CHECK: memref.store %[[VAL_152]], %[[VAL_2]]{{\[}}%[[VAL_147]]] -// CHECK: %[[VAL_154:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_136]]] -// CHECK: %[[VAL_155:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_147]]] -// CHECK: memref.store %[[VAL_155]], %[[VAL_3]]{{\[}}%[[VAL_136]]] -// CHECK: memref.store %[[VAL_154]], %[[VAL_3]]{{\[}}%[[VAL_147]]] -// CHECK: %[[VAL_156:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_136]]] -// CHECK: %[[VAL_157:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_147]]] -// CHECK: memref.store %[[VAL_157]], %[[VAL_4]]{{\[}}%[[VAL_136]]] -// CHECK: memref.store %[[VAL_156]], %[[VAL_4]]{{\[}}%[[VAL_147]]] -// CHECK: %[[VAL_158:.*]] = arith.cmpi eq, %[[VAL_136]], %[[VAL_127]] -// CHECK: %[[VAL_159:.*]] = scf.if %[[VAL_158]] -// CHECK: scf.yield %[[VAL_147]] -// CHECK: } else { -// CHECK: %[[VAL_160:.*]] = arith.cmpi eq, %[[VAL_147]], %[[VAL_127]] -// CHECK: %[[VAL_161:.*]] = arith.select %[[VAL_160]], %[[VAL_136]], %[[VAL_127]] -// CHECK: scf.yield %[[VAL_161]] -// CHECK: } -// CHECK: %[[VAL_162:.*]] = arith.andi %[[VAL_138]], %[[VAL_149]] : i1 -// CHECK: %[[VAL_163:.*]]:2 = scf.if %[[VAL_162]] -// CHECK: 
%[[VAL_164:.*]] = arith.addi %[[VAL_136]], %[[VAL_5]] -// CHECK: %[[VAL_165:.*]] = arith.subi %[[VAL_147]], %[[VAL_5]] -// CHECK: scf.yield %[[VAL_164]], %[[VAL_165]] -// CHECK: } else { -// CHECK: scf.yield %[[VAL_136]], %[[VAL_147]] -// CHECK: } -// CHECK: scf.yield %[[VAL_166:.*]]#0, %[[VAL_166]]#1, %[[VAL_167:.*]] -// CHECK: } else { -// CHECK: scf.yield %[[VAL_136]], %[[VAL_147]], %[[VAL_127]] -// CHECK: } -// CHECK: scf.yield %[[VAL_168:.*]]#0, %[[VAL_168]]#1, %[[VAL_168]]#2 -// CHECK: } -// CHECK: return %[[VAL_169:.*]]#2 -// CHECK: } - -// CHECK-LABEL: func.func private @_sparse_qsort_1_i8_f32_index( -// CHECK-SAME: %[[L:arg0]]: index, -// CHECK-SAME: %[[H:.*]]: index, -// CHECK-SAME: %[[X0:.*]]: memref, -// CHECK-SAME: %[[Y0:.*]]: memref, -// CHECK-SAME: %[[Y1:.*]]: memref) { -// CHECK: %[[C1:.*]] = arith.constant 1 -// CHECK: scf.while (%[[L2:.*]] = %[[L]], %[[H2:.*]] = %[[H]]) -// CHECK: %[[Lb:.*]] = arith.addi %[[L2]], %[[C1]] -// CHECK: %[[COND:.*]] = arith.cmpi ult, %[[Lb]], %[[H2]] -// CHECK: scf.condition(%[[COND]]) %[[L2]], %[[H2]] -// CHECK: } do { -// CHECK: ^bb0(%[[L3:.*]]: index, %[[H3:.*]]: index) -// CHECK: %[[P:.*]] = func.call @_sparse_partition_1_i8_f32_index(%[[L3]], %[[H3]], %[[X0]], %[[Y0]], %[[Y1]]) -// CHECK: %[[PP1:.*]] = arith.addi %[[P]], %[[C1]] : index -// CHECK: %[[LenL:.*]] = arith.subi %[[P]], %[[L3]] -// CHECK: %[[LenH:.*]] = arith.subi %[[H3]], %[[P]] -// CHECK: %[[Cmp:.*]] = arith.cmpi ule, %[[LenL]], %[[LenH]] -// CHECK: %[[L4:.*]] = arith.select %[[Cmp]], %[[PP1]], %[[L3]] -// CHECK: %[[H4:.*]] = arith.select %[[Cmp]], %[[H3]], %[[P]] -// CHECK: scf.if %[[Cmp]] -// CHECK: func.call @_sparse_qsort_1_i8_f32_index(%[[L3]], %[[P]], %[[X0]], %[[Y0]], %[[Y1]]) -// CHECK: else -// CHECK: func.call @_sparse_qsort_1_i8_f32_index(%[[PP1]], %[[H3]], %[[X0]], %[[Y0]], %[[Y1]]) -// CHECK: scf.yield %[[L4]], %[[H4]] -// CHECK: } -// CHECK: return -// CHECK: } - -// CHECK-LABEL: func.func @sparse_sort_1d2v_quick( -// CHECK-SAME: %[[N:.*]]: index, -// CHECK-SAME: %[[X0:.*]]: memref<10xi8>, -// CHECK-SAME: %[[Y0:.*]]: memref, -// CHECK-SAME: %[[Y1:.*]]: memref<10xindex>) -> (memref<10xi8>, memref, memref<10xindex>) { -// CHECK: %[[C0:.*]] = arith.constant 0 -// CHECK: %[[DX0:.*]] = memref.cast %[[X0]] : memref<10xi8> to memref -// CHECK: %[[DY1:.*]] = memref.cast %[[Y1]] : memref<10xindex> to memref -// CHECK: call @_sparse_qsort_1_i8_f32_index(%[[C0]], %[[N]], %[[DX0]], %[[Y0]], %[[DY1]]) -// CHECK: return %[[X0]], %[[Y0]], %[[Y1]] -// CHECK: } +// CHECK-LABEL: func.func private @_sparse_partition_1_i8_f32_index +// CHECK-LABEL: func.func private @_sparse_qsort_1_i8_f32_index +// CHECK-LABEL: func.func @sparse_sort_1d2v_quick func.func @sparse_sort_1d2v_quick(%arg0: index, %arg1: memref<10xi8>, %arg2: memref, %arg3: memref<10xindex>) -> (memref<10xi8>, memref, memref<10xindex>) { sparse_tensor.sort quick_sort %arg0, %arg1 jointly %arg2, %arg3 : memref<10xi8> jointly memref, memref<10xindex> diff --git a/mlir/test/Dialect/SparseTensor/constant_index_map.mlir b/mlir/test/Dialect/SparseTensor/constant_index_map.mlir index bfb4503edbc4e..9eb5353857901 100644 --- a/mlir/test/Dialect/SparseTensor/constant_index_map.mlir +++ b/mlir/test/Dialect/SparseTensor/constant_index_map.mlir @@ -13,7 +13,7 @@ // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 77 : index // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_5:.*]] = bufferization.alloc_tensor() : tensor<77xi1, #{{.*}}> +// CHECK-DAG: 
%[[VAL_5:.*]] = tensor.empty() : tensor<77xi1, #{{.*}}> // CHECK-DAG: %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<1x77xi1> // CHECK-DAG: %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<1x77xi1> // CHECK: %[[VAL_8:.*]] = scf.for %[[VAL_9:.*]] = %[[VAL_3]] to %[[VAL_2]] step %[[VAL_4]] iter_args(%[[VAL_10:.*]] = %[[VAL_5]]) -> (tensor<77xi1, #{{.*}}>) { @@ -27,7 +27,7 @@ // CHECK: return %[[VAL_15]] : tensor<77xi1, #{{.*}}> // CHECK: } func.func @main(%arg0: tensor<1x77xi1>, %arg1: tensor<1x77xi1>) -> tensor<77xi1, #SpVec> { - %0 = bufferization.alloc_tensor() : tensor<77xi1, #SpVec> + %0 = tensor.empty() : tensor<77xi1, #SpVec> %1 = linalg.generic { indexing_maps = [#map1, #map1, #map2], iterator_types = ["parallel"]} diff --git a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir index b3f6ae9f12ee4..fc97685b8378b 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir @@ -1,4 +1,3 @@ -// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py // RUN: mlir-opt %s -sparsification | FileCheck %s #SpVec = #sparse_tensor.encoding<{ map = (d0) -> (d0 : compressed) }> @@ -17,9 +16,9 @@ } // CHECK-LABEL: func @mul_inv_dense1d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> { // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 3 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index @@ -57,13 +56,13 @@ func.func @mul_inv_dense1d(%arga: tensor<32xf32, #SpVec>, } // CHECK-LABEL: func.func @mul_inv_sparse1d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>>) +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>>) // CHECK: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK: %[[VAL_4:.*]] = arith.constant 3 : index // CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[VAL_6:.*]] = bufferization.alloc_tensor() : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_6:.*]] = tensor.empty() : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> // CHECK: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref // CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref // CHECK: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref @@ -95,7 +94,7 @@ func.func @mul_inv_dense1d(%arga: tensor<32xf32, #SpVec>, // CHECK: return %[[VAL_32]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> func.func @mul_inv_sparse1d(%arga: tensor<32xf32, #SpVec>, %argb: tensor<4xf32, #SpVec>) -> tensor<32xf32, #SpVec> { - %argx = bufferization.alloc_tensor() : tensor<32xf32, #SpVec> + %argx = tensor.empty() : tensor<32xf32, #SpVec> %0 = linalg.generic #trait1 ins(%arga, 
%argb: tensor<32xf32, #SpVec>, tensor<4xf32, #SpVec>) outs(%argx: tensor<32xf32, #SpVec>) { @@ -109,13 +108,13 @@ func.func @mul_inv_sparse1d(%arga: tensor<32xf32, #SpVec>, // CHECK-LABEL: func.func @mul_inv_enc_dense1d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> { // CHECK: %[[VAL_2:.*]] = arith.constant 32 : index // CHECK: %[[VAL_3:.*]] = arith.constant 3 : index // CHECK: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_6:.*]] = bufferization.alloc_tensor() : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_6:.*]] = tensor.empty() : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> // CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref // CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<4xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref // CHECK: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_6]] : tensor<32xf32, #sparse_tensor.encoding<{{{.*}}}>> to memref @@ -132,7 +131,7 @@ func.func @mul_inv_sparse1d(%arga: tensor<32xf32, #SpVec>, // CHECK: } func.func @mul_inv_enc_dense1d(%arga: tensor<32xf32, #EncDenseVec>, %argb: tensor<4xf32, #EncDenseVec>) -> tensor<32xf32, #EncDenseVec> { - %argx = bufferization.alloc_tensor() : tensor<32xf32, #EncDenseVec> + %argx = tensor.empty() : tensor<32xf32, #EncDenseVec> %0 = linalg.generic #trait1 ins(%arga, %argb: tensor<32xf32, #EncDenseVec>, tensor<4xf32, #EncDenseVec>) outs(%argx: tensor<32xf32, #EncDenseVec>) { @@ -155,9 +154,9 @@ func.func @mul_inv_enc_dense1d(%arga: tensor<32xf32, #EncDenseVec>, } // CHECK-LABEL: func @and_affine_dense1d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<34xi32>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xi32>) -> tensor<32xi32> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<34xi32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32xi32>) -> tensor<32xi32> { // CHECK-DAG: %[[ZERO:.*]] = arith.constant 0 : i32 // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index @@ -195,12 +194,12 @@ func.func @and_affine_dense1d(%arga: tensor<32xi32, #SpVec>, } // CHECK-LABEL: func.func @and_affine_sparse1d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>>) +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<34xi32, #sparse_tensor.encoding<{{{.*}}}>>) // CHECK: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK: %[[VAL_4:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_5:.*]] = bufferization.alloc_tensor() : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_5:.*]] = tensor.empty() : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> // CHECK: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi32, 
#sparse_tensor.encoding<{{{.*}}}>> to memref // CHECK: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref // CHECK: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> to memref @@ -234,7 +233,7 @@ func.func @and_affine_dense1d(%arga: tensor<32xi32, #SpVec>, // CHECK: return %[[VAL_33]] : tensor<32xi32, #sparse_tensor.encoding<{{{.*}}}>> func.func @and_affine_sparse1d(%arga: tensor<32xi32, #SpVec>, %argb: tensor<34xi32, #SpVec>) -> tensor<32xi32, #SpVec> { - %argx = bufferization.alloc_tensor() : tensor<32xi32, #SpVec> + %argx = tensor.empty() : tensor<32xi32, #SpVec> %0 = linalg.generic #trait2 ins(%arga, %argb: tensor<32xi32, #SpVec>, tensor<34xi32, #SpVec>) outs(%argx: tensor<32xi32, #SpVec>) { @@ -256,9 +255,9 @@ func.func @and_affine_sparse1d(%arga: tensor<32xi32, #SpVec>, } // CHECK-LABEL: func @mul_affine_dense2d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<34x19xf64>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf64>) -> tensor<32x16xf64> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<34x19xf64>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf64>) -> tensor<32x16xf64> { // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 32 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index @@ -304,8 +303,8 @@ func.func @mul_affine_dense2d(%arga: tensor<32x16xf64, #CSR>, // CHECK-LABEL: func.func @mul_affine_sparse2d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<34x19xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> { // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 32 : index // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index @@ -314,7 +313,7 @@ func.func @mul_affine_dense2d(%arga: tensor<32x16xf64, #CSR>, // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 3 : index // CHECK-DAG: %[[VAL_TRUE:.*]] = arith.constant true // CHECK-DAG: %[[VAL_FALSE:.*]] = arith.constant false -// CHECK: %[[VAL_8:.*]] = bufferization.alloc_tensor() : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_8:.*]] = tensor.empty() : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> // CHECK: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref // CHECK: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref // CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref @@ -360,7 +359,7 @@ func.func @mul_affine_dense2d(%arga: tensor<32x16xf64, #CSR>, // CHECK: return %[[VAL_45]] : tensor<32x16xf64, #sparse_tensor.encoding<{{{.*}}}>> func.func @mul_affine_sparse2d(%arga: tensor<32x16xf64, #CSR>, %argb: tensor<34x19xf64, #CSR>) -> tensor<32x16xf64, #CSR> { - %argx = bufferization.alloc_tensor() : tensor<32x16xf64, #CSR> + %argx = tensor.empty() : 
tensor<32x16xf64, #CSR> %0 = linalg.generic #trait3 ins(%arga, %argb: tensor<32x16xf64, #CSR>, tensor<34x19xf64, #CSR>) outs(%argx: tensor<32x16xf64, #CSR>) { @@ -383,9 +382,9 @@ func.func @mul_affine_sparse2d(%arga: tensor<32x16xf64, #CSR>, } // CHECK-LABEL: func.func @mul_affine_dense_dim_2d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<34x16xf64, #sparse_tensor.encoding -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x19xf64, #sparse_tensor.encoding<{{{.*}}}>>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf64>) -> tensor<32x16xf64> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<34x16xf64, #sparse_tensor.encoding +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x19xf64, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf64>) -> tensor<32x16xf64> { // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 19 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index @@ -447,9 +446,9 @@ func.func @mul_affine_dense_dim_2d(%arga: tensor<34x16xf64, #CSR>, } // CHECK-LABEL: func.func @mul_const_affine_dense_dim_2d( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<34x16xf64, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x19xf64, #sparse_tensor.encoding<{{{.*}}}>>, -// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf64>) -> tensor<32x16xf64> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<34x16xf64, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32x19xf64, #sparse_tensor.encoding<{{{.*}}}>>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<32x16xf64>) -> tensor<32x16xf64> { // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 19 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2 : index // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index diff --git a/mlir/test/Dialect/SparseTensor/sparse_broadcast.mlir b/mlir/test/Dialect/SparseTensor/sparse_broadcast.mlir index ae5b941259f65..1a5f79d23cba2 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_broadcast.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_broadcast.mlir @@ -16,7 +16,7 @@ // CHECK-DAG: %[[TMP_c3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[TMP_c0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[TMP_c1:.*]] = arith.constant 1 : index -// CHECK: %[[TMP_0:.*]] = bufferization.alloc_tensor() +// CHECK: %[[TMP_0:.*]] = tensor.empty() // CHECK: %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} // CHECK: %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} // CHECK: %[[TMP_3:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index} @@ -44,7 +44,7 @@ // CHECK: return %[[TMP_8]] module @func_sparse { func.func public @main(%arg0: tensor<4x5xi32, #DCSR>) -> tensor<4x3x5xi32, #SparseTensor> { - %0 = bufferization.alloc_tensor() : tensor<4x3x5xi32, #SparseTensor> + %0 = tensor.empty() : tensor<4x3x5xi32, #SparseTensor> %1 = linalg.generic #trait ins(%arg0 : tensor<4x5xi32, #DCSR>) outs(%0 : tensor<4x3x5xi32, #SparseTensor>) { ^bb0(%in: i32, %out: i32): diff --git a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir index ee3613a268def..d19d7fe2871d6 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir @@ -67,7 +67,7 @@ func.func @kernel(%arga: tensor) -> tensor { %c0 = arith.constant 0 : index %n = tensor.dim %arga, %c0 : tensor - %v = bufferization.alloc_tensor(%n) : tensor + %v = tensor.empty(%n) : tensor %0 = linalg.generic #rowsum ins(%arga: tensor) outs(%v: tensor) { @@ -119,7 +119,7 @@ func.func @kernel(%arga: tensor) -> tensor { // func.func @matmul1(%A: tensor<8x2xf64, #CSR>, 
%B: tensor<2x4xf64, #CSR>) -> tensor<8x4xf64, #CSR> { - %C = bufferization.alloc_tensor() : tensor<8x4xf64, #CSR> + %C = tensor.empty() : tensor<8x4xf64, #CSR> %D = linalg.matmul ins(%A, %B: tensor<8x2xf64, #CSR>, tensor<2x4xf64, #CSR>) outs(%C: tensor<8x4xf64, #CSR>) -> tensor<8x4xf64, #CSR> @@ -167,7 +167,7 @@ func.func @matmul1(%A: tensor<8x2xf64, #CSR>, // func.func @matmul2(%A: tensor<8x2xf64, #CSC>, %B: tensor<2x4xf64, #CSC>) -> tensor<8x4xf64, #CSC> { - %C = bufferization.alloc_tensor() : tensor<8x4xf64, #CSC> + %C = tensor.empty() : tensor<8x4xf64, #CSC> %D = linalg.matmul ins(%A, %B: tensor<8x2xf64, #CSC>, tensor<2x4xf64, #CSC>) outs(%C: tensor<8x4xf64, #CSC>) -> tensor<8x4xf64, #CSC> diff --git a/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir b/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir index 3e18b0c1b6c1b..dac34da30b49f 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir @@ -351,13 +351,13 @@ func.func @divbyc(%arga: tensor<32xf64, #SV>, } // CHECK-LABEL: func.func @zero_preserving_math( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf64, #sparse_tensor.encoding<{{{.*}}}>> { // CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_3:.*]] = bufferization.alloc_tensor() : tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> -// CHECK: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref -// CHECK: %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref +// CHECK: %[[VAL_3:.*]] = tensor.empty() : tensor<32xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref // CHECK: %[[VAL_7:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_1]]] : memref // CHECK: %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[T:.*]] = scf.for %[[VAL_9:.*]] = %[[VAL_7]] to %[[VAL_8]] step %[[VAL_2]] {{.*}} { @@ -371,15 +371,15 @@ func.func @divbyc(%arga: tensor<32xf64, #SV>, // CHECK: %[[VAL_17:.*]] = math.log1p %[[VAL_16]] : f64 // CHECK: %[[VAL_18:.*]] = math.sin %[[VAL_17]] : f64 // CHECK: %[[VAL_19:.*]] = math.tanh %[[VAL_18]] : f64 -// CHECK: %[[Y:.*]] = sparse_tensor.insert %[[VAL_19]] into %{{.*}}[%[[VAL_10]]] : tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> +// CHECK: %[[Y:.*]] = sparse_tensor.insert %[[VAL_19]] into %{{.*}}[%[[VAL_10]]] : tensor<32xf64, #sparse_tensor.encoding<{{{.*}}}>> // CHECK: scf.yield %[[Y]] // CHECK: } -// CHECK: %[[VAL_20:.*]] = sparse_tensor.load %[[T]] hasInserts : tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> -// CHECK: return 
%[[VAL_20]] : tensor<32xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> +// CHECK: %[[VAL_20:.*]] = sparse_tensor.load %[[T]] hasInserts : tensor<32xf64, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_20]] : tensor<32xf64, #sparse_tensor.encoding<{{{.*}}}>> // CHECK: } func.func @zero_preserving_math(%arga: tensor<32xf64, #SV>) -> tensor<32xf64, #SV> { %c32 = arith.constant 32 : index - %xinp = bufferization.alloc_tensor() : tensor<32xf64, #SV> + %xinp = tensor.empty() : tensor<32xf64, #SV> %0 = linalg.generic #trait1 ins(%arga: tensor<32xf64, #SV>) outs(%xinp: tensor<32xf64, #SV>) { @@ -398,29 +398,29 @@ func.func @zero_preserving_math(%arga: tensor<32xf64, #SV>) -> tensor<32xf64, #S } // CHECK-LABEL: func.func @complex_divbyc( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xcomplex, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor<32xcomplex, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> { +// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xcomplex, #sparse_tensor.encoding<{{{.*}}}>> { // CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : index // CHECK: %[[VAL_3:.*]] = complex.constant [0.000000e+00, 1.000000e+00] : complex -// CHECK: %[[VAL_4:.*]] = bufferization.alloc_tensor() : tensor<32xcomplex, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> -// CHECK: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xcomplex, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref -// CHECK: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xcomplex, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref -// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xcomplex, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> to memref> +// CHECK: %[[VAL_4:.*]] = tensor.empty() : tensor<32xcomplex, #sparse_tensor.encoding<{{{.*}}}>> +// CHECK: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xcomplex, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xcomplex, #sparse_tensor.encoding<{{{.*}}}>> to memref +// CHECK: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xcomplex, #sparse_tensor.encoding<{{{.*}}}>> to memref> // CHECK: %[[VAL_8:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_1]]] : memref // CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[T:.*]] = scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_2]] {{.*}} { // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_10]]] : memref // CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_10]]] : memref> // CHECK: %[[VAL_13:.*]] = complex.div %[[VAL_12]], %[[VAL_3]] : complex -// CHECK: %[[Y:.*]] = sparse_tensor.insert %[[VAL_13]] into %{{.*}}[%[[VAL_11]]] : tensor<32xcomplex, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> +// CHECK: %[[Y:.*]] = sparse_tensor.insert %[[VAL_13]] into %{{.*}}[%[[VAL_11]]] : tensor<32xcomplex, #sparse_tensor.encoding<{{{.*}}}>> // CHECK: scf.yield %[[Y]] // CHECK: } -// CHECK: %[[VAL_14:.*]] = sparse_tensor.load %[[T]] hasInserts : tensor<32xcomplex, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> -// CHECK: return %[[VAL_14]] : tensor<32xcomplex, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> +// CHECK: %[[VAL_14:.*]] = sparse_tensor.load %[[T]] hasInserts : tensor<32xcomplex, 
#sparse_tensor.encoding<{{{.*}}}>> +// CHECK: return %[[VAL_14]] : tensor<32xcomplex, #sparse_tensor.encoding<{{{.*}}}>> // CHECK: } func.func @complex_divbyc(%arg0: tensor<32xcomplex, #SV>) -> tensor<32xcomplex, #SV> { %c = complex.constant [0.0, 1.0] : complex - %init = bufferization.alloc_tensor() : tensor<32xcomplex, #SV> + %init = tensor.empty() : tensor<32xcomplex, #SV> %0 = linalg.generic #traitc ins(%arg0: tensor<32xcomplex, #SV>) outs(%init: tensor<32xcomplex, #SV>) { diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir index 7f14934a4ef20..a21fa7b35b543 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir @@ -108,7 +108,7 @@ func.func @matmul_sparse_rhs(%a: tensor<10x20xf32>, // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant false // CHECK-DAG: %[[VAL_5:.*]] = arith.constant true -// CHECK-DAG: %[[VAL_6:.*]] = bufferization.alloc_tensor() : tensor<4x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> +// CHECK-DAG: %[[VAL_6:.*]] = tensor.empty() : tensor<4x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<4x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<4x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<4x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to memref @@ -188,7 +188,7 @@ func.func @matmul_sparse_rhs(%a: tensor<10x20xf32>, func.func @matmul2(%A: tensor<4x8xf64, #DCSR>, %B: tensor<8x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR> { %c4 = arith.constant 4 : index - %C = bufferization.alloc_tensor() : tensor<4x4xf64, #DCSR> + %C = tensor.empty() : tensor<4x4xf64, #DCSR> %D = linalg.matmul ins(%A, %B: tensor<4x8xf64, #DCSR>, tensor<8x4xf64, #DCSR>) outs(%C: tensor<4x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR> diff --git a/mlir/test/Dialect/SparseTensor/sparse_out.mlir b/mlir/test/Dialect/SparseTensor/sparse_out.mlir index 97d8da213423d..9aee12f0d3cef 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_out.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_out.mlir @@ -102,7 +102,7 @@ func.func @sparse_simply_dynamic2(%argx: tensor<32x16xf32, #DCSR>) -> tensor<32x // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2.000000e+00 : f32 -// CHECK-DAG: %[[VAL_5:.*]] = bufferization.alloc_tensor() : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> +// CHECK-DAG: %[[VAL_5:.*]] = tensor.empty() : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> // CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<10x20xf32, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref // CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20xf32, 
#sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref @@ -124,7 +124,7 @@ func.func @sparse_simply_dynamic2(%argx: tensor<32x16xf32, #DCSR>) -> tensor<32x // CHECK: } func.func @sparse_truly_dynamic(%arga: tensor<10x20xf32, #CSR>) -> tensor<10x20xf32, #DCSR> { %s = arith.constant 2.0 : f32 - %xm = bufferization.alloc_tensor() : tensor<10x20xf32, #DCSR> + %xm = tensor.empty() : tensor<10x20xf32, #DCSR> %0 = linalg.generic #trait_scale ins(%arga: tensor<10x20xf32, #CSR>) outs(%xm: tensor<10x20xf32, #DCSR>) { @@ -155,7 +155,7 @@ func.func @sparse_truly_dynamic(%arga: tensor<10x20xf32, #CSR>) -> tensor<10x20x // CHECK-DAG: %[[VAL_TRUE:.*]] = arith.constant true // CHECK: %[[VAL_5:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor> // CHECK: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor> -// CHECK: %[[VAL_7:.*]] = bufferization.alloc_tensor(%[[VAL_5]], %[[VAL_6]]) : tensor> +// CHECK: %[[VAL_7:.*]] = tensor.empty(%[[VAL_5]], %[[VAL_6]]) : tensor> // CHECK: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor> to memref // CHECK: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor> to memref // CHECK: %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor> to memref @@ -286,7 +286,7 @@ func.func @sumred(%arga: tensor, %c1 = arith.constant 1 : index %d0 = tensor.dim %arga, %c0 : tensor %d1 = tensor.dim %arga, %c1 : tensor - %xinit = bufferization.alloc_tensor(%d0, %d1) : tensor + %xinit = tensor.empty(%d0, %d1) : tensor %0 = linalg.generic #trait_sumred ins(%arga, %argb: tensor, tensor) @@ -318,7 +318,7 @@ func.func @sumred(%arga: tensor, // CHECK-DAG: %[[VAL_5:.*]] = arith.constant true // CHECK: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor> // CHECK: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor> -// CHECK: %[[VAL_8:.*]] = bufferization.alloc_tensor(%[[VAL_6]], %[[VAL_7]]) : tensor> +// CHECK: %[[VAL_8:.*]] = tensor.empty(%[[VAL_6]], %[[VAL_7]]) : tensor> // CHECK: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor> to memref // CHECK: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor> to memref // CHECK: %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor> to memref @@ -401,7 +401,7 @@ func.func @matmat(%arga: tensor, %c1 = arith.constant 1 : index %d0 = tensor.dim %arga, %c0 : tensor %d1 = tensor.dim %argb, %c1 : tensor - %cinit = bufferization.alloc_tensor(%d0, %d1) : tensor + %cinit = tensor.empty(%d0, %d1) : tensor %0 = linalg.generic #trait_matmat ins(%arga, %argb: tensor, tensor) diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir index 8e23f901da765..67841beaa6933 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_vector_ops.mlir @@ -48,7 +48,7 @@ // CHECK: } func.func @vops(%arga: tensor<1024xf32, #DenseVector>, %argb: tensor<1024xf32, #DenseVector>) -> tensor<1024xf32> { - %init = bufferization.alloc_tensor() : tensor<1024xf32> + %init = tensor.empty() : tensor<1024xf32> %o = arith.constant 1.0 : f32 %c = arith.constant 2.0 : f32 %i = arith.constant 255 : i64 diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir index 04877b1b21e1a..9052744a1d3f9 100644 --- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir @@ 
-380,3 +380,24 @@ func.func @tensor.reshape() -> tensor<2x2x5xf32> {
 // CHECK: return %[[RESHAPED]]
 return %reshaped : tensor<2x2x5xf32>
}
+
+// -----
+
+// CHECK-LABEL: @reshape_with_non_identity_layout(
+// CHECK-SAME: %[[INPUT:[a-zA-Z0-9]*]]: memref<2x2xf32, strided<[?, ?], offset: ?>>,
+// CHECK-SAME: %[[LAYOUT:[a-zA-Z0-9]*]]: memref<2xi32, strided<[?], offset: ?>>)
+func.func @reshape_with_non_identity_layout(%arg0: tensor<2x2xf32>, %arg1: tensor<2xi32>) -> tensor<1x2xf32> {
+
+ // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[INPUT]][1, 0] [1, 2] [1, 1] : memref<2x2xf32, strided<[?, ?], offset: ?>> to memref<2xf32, strided<[?], offset: ?>>
+ %extracted_slice = tensor.extract_slice %arg0[1, 0] [1, 2] [1, 1] : tensor<2x2xf32> to tensor<2xf32>
+
+ // To satisfy the constraints of memref.reshape, the subview must be cloned into
+ // a buffer with an identity layout.
+ // CHECK: %[[CLONED:.+]] = bufferization.clone %[[SUBVIEW]] : memref<2xf32, strided<[?], offset: ?>> to memref<2xf32>
+ // CHECK: %[[RESHAPED:.+]] = memref.reshape %[[CLONED]](%[[LAYOUT]]) : (memref<2xf32>, memref<2xi32, strided<[?], offset: ?>>) -> memref<1x2xf32>
+
+ %reshape = tensor.reshape %extracted_slice(%arg1) : (tensor<2xf32>, tensor<2xi32>) -> tensor<1x2xf32>
+
+ // CHECK: return %[[RESHAPED]] : memref<1x2xf32>
+ return %reshape : tensor<1x2xf32>
+}
diff --git a/mlir/test/Dialect/Transform/ops-invalid.mlir b/mlir/test/Dialect/Transform/ops-invalid.mlir
index 3e30947769eb4..0964161588798 100644
--- a/mlir/test/Dialect/Transform/ops-invalid.mlir
+++ b/mlir/test/Dialect/Transform/ops-invalid.mlir
@@ -22,6 +22,16 @@ transform.sequence failures(propagate) {
 }
}
+
+// -----
+
+// expected-error @below {{expects to have a terminator in the body}}
+"transform.sequence"() <{failure_propagation_mode = 1 : i32, operandSegmentSizes = array}> ({
+^bb0(%arg0: !transform.any_op):
+ transform.apply_patterns to %arg0 {
+ } : !transform.any_op
+}) : () -> ()
+
+
// -----
// expected-error @below {{'transform.sequence' op expects trailing entry block arguments to be of type implementing TransformHandleTypeInterface, TransformValueHandleTypeInterface or TransformParamTypeInterface}}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir
index c3bdc30e355b1..ca5dd00d02aff 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_rewrite_sort_coo.mlir
@@ -94,14 +94,14 @@ module {
 %y1 = memref.cast %y1s : memref<7xi32> to memref
 // Sort "parallel arrays".
- // CHECK: ( 1, 1, 2, 5, 10 )
- // CHECK: ( 3, 3, 1, 10, 1 )
- // CHECK: ( 9, 9, 4, 7, 2 )
- // CHECK: ( 7, 8, 10, 9, 6 )
- // CHECK: ( 7, 4, 7, 9, 5 )
- call @storeValuesToStrided(%x0, %c10, %c2, %c1, %c5, %c1)
+ // CHECK: ( 1, 1, 3, 3, 10 )
+ // CHECK: ( 2, 10, 1, 1, 5 )
+ // CHECK: ( 4, 2, 9, 9, 7 )
+ // CHECK: ( 10, 6, 7, 8, 9 )
+ // CHECK: ( 7, 5, 7, 4, 9 )
+ call @storeValuesToStrided(%x0, %c1, %c1, %c3, %c10, %c3)
 : (memref>, i32, i32, i32, i32, i32) -> ()
- call @storeValuesToStrided(%x1, %c1, %c1, %c3, %c10, %c3)
+ call @storeValuesToStrided(%x1, %c10, %c2, %c1, %c5, %c1)
 : (memref>, i32, i32, i32, i32, i32) -> ()
 call @storeValuesToStrided(%x2, %c2, %c4, %c9, %c7, %c9)
 : (memref>, i32, i32, i32, i32, i32) -> ()
@@ -122,14 +122,14 @@ module {
 %y1v = vector.transfer_read %y1[%i0], %c100: memref, vector<5xi32>
 vector.print %y1v : vector<5xi32>
 // Stable sort.
- // CHECK: ( 1, 1, 2, 5, 10 ) - // CHECK: ( 3, 3, 1, 10, 1 ) - // CHECK: ( 9, 9, 4, 7, 2 ) - // CHECK: ( 8, 7, 10, 9, 6 ) - // CHECK: ( 4, 7, 7, 9, 5 ) - call @storeValuesToStrided(%x0, %c10, %c2, %c1, %c5, %c1) + // CHECK: ( 1, 1, 3, 3, 10 ) + // CHECK: ( 2, 10, 1, 1, 5 ) + // CHECK: ( 4, 2, 9, 9, 7 ) + // CHECK: ( 10, 6, 8, 7, 9 ) + // CHECK: ( 7, 5, 4, 7, 9 ) + call @storeValuesToStrided(%x0, %c1, %c1, %c3, %c10, %c3) : (memref>, i32, i32, i32, i32, i32) -> () - call @storeValuesToStrided(%x1, %c1, %c1, %c3, %c10, %c3) + call @storeValuesToStrided(%x1, %c10, %c2, %c1, %c5, %c1) : (memref>, i32, i32, i32, i32, i32) -> () call @storeValuesToStrided(%x2, %c2, %c4, %c9, %c7, %c9) : (memref>, i32, i32, i32, i32, i32) -> () @@ -150,14 +150,14 @@ module { %y1v2 = vector.transfer_read %y1[%i0], %c100: memref, vector<5xi32> vector.print %y1v2 : vector<5xi32> // Heap sort. - // CHECK: ( 1, 1, 2, 5, 10 ) - // CHECK: ( 3, 3, 1, 10, 1 ) - // CHECK: ( 9, 9, 4, 7, 2 ) - // CHECK: ( 7, 8, 10, 9, 6 ) - // CHECK: ( 7, 4, 7, 9, 5 ) - call @storeValuesToStrided(%x0, %c10, %c2, %c1, %c5, %c1) + // CHECK: ( 1, 1, 3, 3, 10 ) + // CHECK: ( 2, 10, 1, 1, 5 ) + // CHECK: ( 4, 2, 9, 9, 7 ) + // CHECK: ( 10, 6, 8, 7, 9 ) + // CHECK: ( 7, 5, 4, 7, 9 ) + call @storeValuesToStrided(%x0, %c1, %c1, %c3, %c10, %c3) : (memref>, i32, i32, i32, i32, i32) -> () - call @storeValuesToStrided(%x1, %c1, %c1, %c3, %c10, %c3) + call @storeValuesToStrided(%x1, %c10, %c2, %c1, %c5, %c1) : (memref>, i32, i32, i32, i32, i32) -> () call @storeValuesToStrided(%x2, %c2, %c4, %c9, %c7, %c9) : (memref>, i32, i32, i32, i32, i32) -> () diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir index 4d92ce4cb6a1f..5ed76fd5a1eda 100644 --- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir @@ -2,10 +2,11 @@ llvm.func @_QPopenmp_target_data() { %0 = llvm.mlir.constant(1 : i64) : i64 - %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr - omp.target_data map((tofrom -> %1 : !llvm.ptr)) { - %2 = llvm.mlir.constant(99 : i32) : i32 - llvm.store %2, %1 : !llvm.ptr + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %2 = omp.map_info var_ptr(%1 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + omp.target_data map_entries(%2 : !llvm.ptr) { + %3 = llvm.mlir.constant(99 : i32) : i32 + llvm.store %3, %1 : !llvm.ptr omp.terminator } llvm.return @@ -38,13 +39,14 @@ llvm.func @_QPopenmp_target_data() { // ----- llvm.func @_QPopenmp_target_data_region(%1 : !llvm.ptr>) { - omp.target_data map((from -> %1 : !llvm.ptr>)) { - %2 = llvm.mlir.constant(99 : i32) : i32 - %3 = llvm.mlir.constant(1 : i64) : i64 + %2 = omp.map_info var_ptr(%1 : !llvm.ptr>) map_clauses(from) capture(ByRef) -> !llvm.ptr> {name = ""} + omp.target_data map_entries(%2 : !llvm.ptr>) { + %3 = llvm.mlir.constant(99 : i32) : i32 %4 = llvm.mlir.constant(1 : i64) : i64 - %5 = llvm.mlir.constant(0 : i64) : i64 - %6 = llvm.getelementptr %1[0, %5] : (!llvm.ptr>, i64) -> !llvm.ptr - llvm.store %2, %6 : !llvm.ptr + %5 = llvm.mlir.constant(1 : i64) : i64 + %6 = llvm.mlir.constant(0 : i64) : i64 + %7 = llvm.getelementptr %1[0, %6] : (!llvm.ptr>, i64) -> !llvm.ptr + llvm.store %3, %7 : !llvm.ptr omp.terminator } llvm.return @@ -90,12 +92,16 @@ llvm.func @_QPomp_target_enter_exit(%1 : 
diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
index 4d92ce4cb6a1f..5ed76fd5a1eda 100644
--- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir
@@ -2,10 +2,11 @@
 
 llvm.func @_QPopenmp_target_data() {
   %0 = llvm.mlir.constant(1 : i64) : i64
-  %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
-  omp.target_data map((tofrom -> %1 : !llvm.ptr)) {
-    %2 = llvm.mlir.constant(99 : i32) : i32
-    llvm.store %2, %1 : !llvm.ptr
+  %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr
+  %2 = omp.map_info var_ptr(%1 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+  omp.target_data map_entries(%2 : !llvm.ptr) {
+    %3 = llvm.mlir.constant(99 : i32) : i32
+    llvm.store %3, %1 : !llvm.ptr
     omp.terminator
   }
   llvm.return
@@ -38,13 +39,14 @@ llvm.func @_QPopenmp_target_data() {
 // -----
 
 llvm.func @_QPopenmp_target_data_region(%1 : !llvm.ptr>) {
-  omp.target_data map((from -> %1 : !llvm.ptr>)) {
-    %2 = llvm.mlir.constant(99 : i32) : i32
-    %3 = llvm.mlir.constant(1 : i64) : i64
+  %2 = omp.map_info var_ptr(%1 : !llvm.ptr>) map_clauses(from) capture(ByRef) -> !llvm.ptr> {name = ""}
+  omp.target_data map_entries(%2 : !llvm.ptr>) {
+    %3 = llvm.mlir.constant(99 : i32) : i32
     %4 = llvm.mlir.constant(1 : i64) : i64
-    %5 = llvm.mlir.constant(0 : i64) : i64
-    %6 = llvm.getelementptr %1[0, %5] : (!llvm.ptr>, i64) -> !llvm.ptr
-    llvm.store %2, %6 : !llvm.ptr
+    %5 = llvm.mlir.constant(1 : i64) : i64
+    %6 = llvm.mlir.constant(0 : i64) : i64
+    %7 = llvm.getelementptr %1[0, %6] : (!llvm.ptr>, i64) -> !llvm.ptr
+    llvm.store %3, %7 : !llvm.ptr
     omp.terminator
   }
   llvm.return
@@ -90,12 +92,16 @@ llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr>, %3 : !llv
   %11 = llvm.mlir.constant(10 : i32) : i32
   %12 = llvm.icmp "slt" %10, %11 : i32
   %13 = llvm.load %5 : !llvm.ptr
-  omp.target_enter_data if(%12 : i1) device(%13 : i32) map((to -> %1 : !llvm.ptr>), (alloc -> %3 : !llvm.ptr>))
+  %map1 = omp.map_info var_ptr(%1 : !llvm.ptr>) map_clauses(to) capture(ByRef) -> !llvm.ptr> {name = ""}
+  %map2 = omp.map_info var_ptr(%3 : !llvm.ptr>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr> {name = ""}
+  omp.target_enter_data if(%12 : i1) device(%13 : i32) map_entries(%map1, %map2 : !llvm.ptr>, !llvm.ptr>)
   %14 = llvm.load %7 : !llvm.ptr
   %15 = llvm.mlir.constant(10 : i32) : i32
   %16 = llvm.icmp "sgt" %14, %15 : i32
   %17 = llvm.load %5 : !llvm.ptr
-  omp.target_exit_data if(%16 : i1) device(%17 : i32) map((from -> %1 : !llvm.ptr>), (release -> %3 : !llvm.ptr>))
+  %map3 = omp.map_info var_ptr(%1 : !llvm.ptr>) map_clauses(from) capture(ByRef) -> !llvm.ptr> {name = ""}
+  %map4 = omp.map_info var_ptr(%3 : !llvm.ptr>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !llvm.ptr> {name = ""}
+  omp.target_exit_data if(%16 : i1) device(%17 : i32) map_entries(%map3, %map4 : !llvm.ptr>, !llvm.ptr>)
   llvm.return
 }
 
@@ -172,7 +178,8 @@ llvm.func @_QPomp_target_enter_exit(%1 : !llvm.ptr>, %3 : !llv
 llvm.func @_QPopenmp_target_use_dev_ptr() {
   %0 = llvm.mlir.constant(1 : i64) : i64
   %a = llvm.alloca %0 x !llvm.ptr> : (i64) -> !llvm.ptr>
-  omp.target_data map((from -> %a : !llvm.ptr>)) use_device_ptr(%a : !llvm.ptr>) {
+  %map1 = omp.map_info var_ptr(%a : !llvm.ptr>) map_clauses(from) capture(ByRef) -> !llvm.ptr> {name = ""}
+  omp.target_data map_entries(%map1 : !llvm.ptr>) use_device_ptr(%a : !llvm.ptr>) {
   ^bb0(%arg0: !llvm.ptr>):
     %1 = llvm.mlir.constant(10 : i32) : i32
     %2 = llvm.load %arg0 : !llvm.ptr>
@@ -215,7 +222,8 @@ llvm.func @_QPopenmp_target_use_dev_ptr() {
 llvm.func @_QPopenmp_target_use_dev_addr() {
   %0 = llvm.mlir.constant(1 : i64) : i64
   %a = llvm.alloca %0 x !llvm.ptr> : (i64) -> !llvm.ptr>
-  omp.target_data map((from -> %a : !llvm.ptr>)) use_device_addr(%a : !llvm.ptr>) {
+  %map = omp.map_info var_ptr(%a : !llvm.ptr>) map_clauses(from) capture(ByRef) -> !llvm.ptr> {name = ""}
+  omp.target_data map_entries(%map : !llvm.ptr>) use_device_addr(%a : !llvm.ptr>) {
   ^bb0(%arg0: !llvm.ptr>):
     %1 = llvm.mlir.constant(10 : i32) : i32
     %2 = llvm.load %arg0 : !llvm.ptr>
@@ -256,7 +264,8 @@ llvm.func @_QPopenmp_target_use_dev_addr() {
 llvm.func @_QPopenmp_target_use_dev_addr_no_ptr() {
   %0 = llvm.mlir.constant(1 : i64) : i64
   %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr
-  omp.target_data map((tofrom -> %a : !llvm.ptr)) use_device_addr(%a : !llvm.ptr) {
+  %map = omp.map_info var_ptr(%a : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+  omp.target_data map_entries(%map : !llvm.ptr) use_device_addr(%a : !llvm.ptr) {
   ^bb0(%arg0: !llvm.ptr):
     %1 = llvm.mlir.constant(10 : i32) : i32
     llvm.store %1, %arg0 : !llvm.ptr
@@ -297,7 +306,8 @@ llvm.func @_QPopenmp_target_use_dev_addr_nomap() {
   %a = llvm.alloca %0 x !llvm.ptr> : (i64) -> !llvm.ptr>
   %1 = llvm.mlir.constant(1 : i64) : i64
   %b = llvm.alloca %0 x !llvm.ptr> : (i64) -> !llvm.ptr>
-  omp.target_data map((from -> %b : !llvm.ptr>)) use_device_addr(%a : !llvm.ptr>) {
+  %map = omp.map_info var_ptr(%b : !llvm.ptr>) map_clauses(from) capture(ByRef) -> !llvm.ptr> {name = ""}
+  omp.target_data map_entries(%map : !llvm.ptr>) use_device_addr(%a : !llvm.ptr>) {
   ^bb0(%arg0: !llvm.ptr>):
     %2 = llvm.mlir.constant(10 : i32) : i32
     %3 = llvm.load %arg0 : !llvm.ptr>
@@ -352,7 +362,9 @@ llvm.func @_QPopenmp_target_use_dev_both() {
   %a = llvm.alloca %0 x !llvm.ptr> : (i64) -> !llvm.ptr>
   %1 = llvm.mlir.constant(1 : i64) : i64
   %b = llvm.alloca %0 x !llvm.ptr> : (i64) -> !llvm.ptr>
-  omp.target_data map((tofrom -> %a : !llvm.ptr>), (tofrom -> %b : !llvm.ptr>)) use_device_ptr(%a : !llvm.ptr>) use_device_addr(%b : !llvm.ptr>) {
+  %map = omp.map_info var_ptr(%a : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr> {name = ""}
+  %map1 = omp.map_info var_ptr(%b : !llvm.ptr>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr> {name = ""}
+  omp.target_data map_entries(%map, %map1 : !llvm.ptr>, !llvm.ptr>) use_device_ptr(%a : !llvm.ptr>) use_device_addr(%b : !llvm.ptr>) {
   ^bb0(%arg0: !llvm.ptr>, %arg1: !llvm.ptr>):
     %2 = llvm.mlir.constant(10 : i32) : i32
     %3 = llvm.load %arg0 : !llvm.ptr>
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
index 15b515de5e9f4..cf70469e7484f 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
@@ -12,7 +12,10 @@ module attributes {omp.is_target_device = true} {
     %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr
     llvm.store %1, %3 : !llvm.ptr
     llvm.store %0, %5 : !llvm.ptr
-    omp.target map((to -> %3 : !llvm.ptr), (to -> %5 : !llvm.ptr), (from -> %7 : !llvm.ptr)) {
+    %map1 = omp.map_info var_ptr(%3 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map2 = omp.map_info var_ptr(%5 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map3 = omp.map_info var_ptr(%7 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target map_entries(%map1, %map2, %map3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
       %8 = llvm.load %3 : !llvm.ptr
       %9 = llvm.load %5 : !llvm.ptr
       %10 = llvm.add %8, %9 : i32
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-llvm-target-device.mlir b/mlir/test/Target/LLVMIR/omptarget-region-llvm-target-device.mlir
index bee77bbea2cde..01a3bd556294f 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-llvm-target-device.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-llvm-target-device.mlir
@@ -4,15 +4,17 @@
 module attributes {omp.is_target_device = true} {
   llvm.func @writeindex_omp_outline_0_(%arg0: !llvm.ptr, %arg1: !llvm.ptr) attributes {omp.outline_parent_name = "writeindex_"} {
-    omp.target map((from -> %arg0 : !llvm.ptr), (implicit -> %arg1: !llvm.ptr)) {
-      %0 = llvm.mlir.constant(20 : i32) : i32
-      %1 = llvm.mlir.constant(10 : i32) : i32
-      llvm.store %1, %arg0 : !llvm.ptr
-      llvm.store %0, %arg1 : !llvm.ptr
+    %0 = omp.map_info var_ptr(%arg0 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %1 = omp.map_info var_ptr(%arg1 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target map_entries(%0, %1 : !llvm.ptr, !llvm.ptr) {
+      %2 = llvm.mlir.constant(20 : i32) : i32
+      %3 = llvm.mlir.constant(10 : i32) : i32
+      llvm.store %3, %arg0 : !llvm.ptr
+      llvm.store %2, %arg1 : !llvm.ptr
       omp.terminator
     }
     llvm.return
   }
 }
-// CHECK: define {{.*}} void @__omp_offloading_{{.*}}_{{.*}}_writeindex__l7(ptr {{.*}}, ptr {{.*}}) {
+// CHECK: define {{.*}} void @__omp_offloading_{{.*}}_{{.*}}_writeindex__l{{.*}}(ptr {{.*}}, ptr {{.*}}) {
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir
index fa77540c1c911..51506386d8378 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-llvm.mlir
@@ -12,7 +12,10 @@ module attributes {omp.is_target_device = false} {
     %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr
     llvm.store %1, %3 : !llvm.ptr
     llvm.store %0, %5 : !llvm.ptr
-    omp.target map((to -> %3 : !llvm.ptr), (to -> %5 : !llvm.ptr), (from -> %7 : !llvm.ptr)) {
+    %map1 = omp.map_info var_ptr(%3 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map2 = omp.map_info var_ptr(%5 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map3 = omp.map_info var_ptr(%7 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target map_entries(%map1, %map2, %map3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
       %8 = llvm.load %3 : !llvm.ptr
       %9 = llvm.load %5 : !llvm.ptr
       %10 = llvm.add %8, %9 : i32
diff --git a/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir
index dce2f58487da1..f0bd37ca36e93 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-parallel-llvm.mlir
@@ -12,7 +12,10 @@ module attributes {omp.is_target_device = false} {
     %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr
     llvm.store %1, %3 : !llvm.ptr
     llvm.store %0, %5 : !llvm.ptr
-    omp.target map((to -> %3 : !llvm.ptr), (to -> %5 : !llvm.ptr), (from -> %7 : !llvm.ptr)) {
+    %map1 = omp.map_info var_ptr(%3 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map2 = omp.map_info var_ptr(%5 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    %map3 = omp.map_info var_ptr(%7 : !llvm.ptr) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+    omp.target map_entries(%map1, %map2, %map3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
       omp.parallel {
         %8 = llvm.load %3 : !llvm.ptr
         %9 = llvm.load %5 : !llvm.ptr
diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir
index 4526b23fa503d..e9b1e22359001 100644
--- a/mlir/test/Transforms/loop-invariant-code-motion.mlir
+++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir
@@ -929,3 +929,30 @@ func.func @speculate_dynamic_pack_and_unpack(%source: tensor,
   }
   return
 }
+
+// -----
+
+// CHECK-LABEL: func @hoist_from_scf_while(
+//  CHECK-SAME: %[[arg0:.*]]: i32, %{{.*}}: i32)
+//   CHECK-DAG: arith.constant 1 : i32
+//   CHECK-DAG: %[[c2:.*]] = arith.constant 2 : i32
+//   CHECK-DAG: %[[c10:.*]] = arith.constant 10 : i32
+//   CHECK-DAG: %[[added:.*]] = arith.addi %[[arg0]], %[[c2]]
+//       CHECK: scf.while
+//       CHECK:   %[[cmpi:.*]] = arith.cmpi slt, %{{.*}}, %[[added]]
+//       CHECK:   scf.condition(%[[cmpi]])
+func.func @hoist_from_scf_while(%arg0: i32, %arg1: i32) -> i32 {
+  %0 = scf.while (%arg2 = %arg1) : (i32) -> (i32) {
+    %c2 = arith.constant 2 : i32
+    %c10 = arith.constant 10 : i32
+    %added = arith.addi %arg0, %c2 : i32
+    %1 = arith.cmpi slt, %arg2, %added : i32
+    scf.condition(%1) %arg2 : i32
+  } do {
+  ^bb0(%arg2: i32):
+    %c1 = arith.constant 1 : i32
+    %added2 = arith.addi %c1, %arg2 : i32
+    scf.yield %added2 : i32
+  }
+  return %0 : i32
+}
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
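Note on the new @hoist_from_scf_while case: it exercises LICM's ability to hoist from the "before" region of scf.while, since %added depends only on values defined outside the loop even though it feeds scf.condition. A minimal sketch of driving the same hoisting from C++, assuming the utility declared in mlir/Transforms/LoopInvariantCodeMotionUtils.h:

#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"

using namespace mlir;

// Hoist loop-invariant, side-effect-free ops out of every loop-like op
// (scf.for, scf.while, ...) nested under `root`.
static void hoistAllInvariants(Operation *root) {
  root->walk([](LoopLikeOpInterface loopLike) {
    // Returns the number of ops moved; ignored in this sketch.
    (void)moveLoopInvariantCode(loopLike);
  });
}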
index 0aa8ce4de9756..354a43c244e3b 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2570,7 +2570,7 @@ def TestGraphLoopOp : TEST_Op<"graph_loop",
   }];
 
   let extraClassDeclaration = [{
-    mlir::Region &getLoopBody() { return getBody(); }
+    llvm::SmallVector getLoopRegions() { return {&getBody()}; }
   }];
 }
diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterface.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterface.cpp
index 752c885e0b87b..2fcc7bcadb604 100644
--- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterface.cpp
+++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterface.cpp
@@ -450,7 +450,9 @@ static void addPatternForTiling(MLIRContext *context,
                                 ArrayRef tileSizes,
                                 ArrayRef interchange = {}) {
   scf::SCFTilingOptions tilingOptions;
-  tilingOptions.setTileSizes(tileSizes).setInterchange(interchange);
+  SmallVector tileSizesOfr =
+      getAsIndexOpFoldResult(context, tileSizes);
+  tilingOptions.setTileSizes(tileSizesOfr).setInterchange(interchange);
   LinalgTransformationFilter filter(StringAttr::get(context, filterName),
                                     StringAttr::get(context, "tiled"));
   patterns.add(context, tilingOptions, filter);
@@ -462,7 +464,9 @@ static void addPatternForTileFuseAndYield(MLIRContext *context,
                                           ArrayRef tileSizes,
                                           ArrayRef interchange = {}) {
   scf::SCFTilingOptions tilingOptions;
-  tilingOptions.setTileSizes(tileSizes).setInterchange(interchange);
+  SmallVector tileSizesOfr =
+      getAsIndexOpFoldResult(context, tileSizes);
+  tilingOptions.setTileSizes(tileSizesOfr).setInterchange(interchange);
   LinalgTransformationFilter filter(StringAttr::get(context, filterName),
                                     StringAttr::get(context, "tiled"));
   patterns.add(
@@ -475,8 +479,10 @@ static void addPatternForTileAndFuse(MLIRContext *context,
                                      ArrayRef tileSizes,
                                      ArrayRef interchange = {}) {
   scf::SCFTileAndFuseOptions tileAndFuseOptions;
-  tileAndFuseOptions.tilingOptions.setTileSizes(tileSizes).setInterchange(
-      interchange);
+  SmallVector tileSizesOfr =
+      getAsIndexOpFoldResult(context, tileSizes);
+  tileAndFuseOptions.tilingOptions.setTileSizes(tileSizesOfr)
+      .setInterchange(interchange);
   LinalgTransformationFilter filter(StringAttr::get(context, filterName),
                                     StringAttr::get(context, "tiled"));
   patterns.add(
diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py
index 5d5ee945b6686..69181160d5489 100644
--- a/mlir/test/python/dialects/transform_structured_ext.py
+++ b/mlir/test/python/dialects/transform_structured_ext.py
@@ -169,6 +169,16 @@ def testMatchOpNamesList(target):
     # CHECK-SAME: (!transform.any_op) -> !transform.any_op
 
 
+@run
+@create_sequence
+def testMaskedVectorizeNoArgs(target):
+    structured.MaskedVectorizeOp(target)
+    # CHECK-LABEL: TEST: testMaskedVectorizeNoArgs
+    # CHECK: transform.sequence
+    # CHECK: transform.structured.masked_vectorize
+    # CHECK-NOT: vector_sizes
+
+
 @run
 @create_sequence
 def testMaskedVectorizeStatic(target):
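The three TestTilingInterface hunks above are the same mechanical update: setTileSizes now takes OpFoldResult rather than raw static integers, so the int64_t sizes are wrapped as index attributes first via getAsIndexOpFoldResult. A condensed sketch of the shared pattern (the helper name is hypothetical; the calls are the ones used in the hunks):

#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"

using namespace mlir;

// Hypothetical helper factoring out what each hunk above now does inline.
static scf::SCFTilingOptions
makeTilingOptions(MLIRContext *context, ArrayRef<int64_t> tileSizes,
                  ArrayRef<int64_t> interchange = {}) {
  scf::SCFTilingOptions tilingOptions;
  // getAsIndexOpFoldResult wraps each static size as an index-typed
  // IntegerAttr; dynamic sizes can flow through the same API as Values.
  SmallVector<OpFoldResult> tileSizesOfr =
      getAsIndexOpFoldResult(context, tileSizes);
  tilingOptions.setTileSizes(tileSizesOfr).setInterchange(interchange);
  return tilingOptions;
}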
diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp
index c91dd76657570..a226cc2a1b250 100644
--- a/polly/lib/CodeGen/IslNodeBuilder.cpp
+++ b/polly/lib/CodeGen/IslNodeBuilder.cpp
@@ -1292,9 +1292,9 @@ void IslNodeBuilder::allocateNewArrays(BBPair StartExitBlocks) {
       unsigned Size = SAI->getElemSizeInBytes();
 
       // Insert the malloc call at polly.start
-      auto InstIt = std::get<0>(StartExitBlocks)->getTerminator();
-      auto *CreatedArray = CallInst::CreateMalloc(
-          &*InstIt, IntPtrTy, SAI->getElementType(),
+      Builder.SetInsertPoint(std::get<0>(StartExitBlocks)->getTerminator());
+      auto *CreatedArray = Builder.CreateMalloc(
+          IntPtrTy, SAI->getElementType(),
           ConstantInt::get(Type::getInt64Ty(Ctx), Size),
           ConstantInt::get(Type::getInt64Ty(Ctx), ArraySizeInt), nullptr,
           SAI->getName());
@@ -1302,8 +1302,8 @@ void IslNodeBuilder::allocateNewArrays(BBPair StartExitBlocks) {
       SAI->setBasePtr(CreatedArray);
 
       // Insert the free call at polly.exiting
-      CallInst::CreateFree(CreatedArray,
-                           std::get<1>(StartExitBlocks)->getTerminator());
+      Builder.SetInsertPoint(std::get<1>(StartExitBlocks)->getTerminator());
+      Builder.CreateFree(CreatedArray);
     } else {
       auto InstIt = Builder.GetInsertBlock()
                         ->getParent()
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index 58af4c8b131b0..010ec879e44a3 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -216,23 +216,6 @@ if(LLVM_INCLUDE_TESTS)
   umbrella_lit_testsuite_begin(check-runtimes)
 endif()
 
-# llvm-libgcc incorporates both compiler-rt and libunwind as subprojects with very
-# specific flags, which causes clashes when they're independently built too.
-if("llvm-libgcc" IN_LIST runtimes)
-  if("compiler-rt" IN_LIST runtimes OR "compiler-rt" IN_LIST LLVM_ENABLE_PROJECTS)
-    message(FATAL_ERROR
-      "Attempting to build both compiler-rt and llvm-libgcc will cause irreconcilable "
-      "target clashes. Please choose one or the other, but not both.")
-  endif()
-
-  if("libunwind" IN_LIST runtimes)
-    message(
-      FATAL_ERROR
-      "Attempting to build both libunwind and llvm-libgcc will cause irreconcilable "
-      "target clashes. Please choose one or the other, but not both.")
-  endif()
-endif()
-
 # We do this in two loops so that HAVE_* is set for each runtime before the
 # other runtimes are added.
 foreach(entry ${runtimes})
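The Polly change above tracks the removal of CallInst::CreateMalloc/CreateFree from LLVM: the replacements live on IRBuilder and insert at the builder's current insertion point instead of taking an insert-before instruction. A minimal sketch of the new idiom (function and parameter names here are illustrative, not Polly's):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Allocate a heap buffer at the start block and free it at the exit block,
// the same malloc/free pairing the hunk above creates around a SCoP.
static Value *emitScopedMalloc(IRBuilder<> &Builder, BasicBlock *Start,
                               BasicBlock *Exit, Type *IntPtrTy, Type *ElemTy,
                               Value *AllocSize) {
  // The malloc is emitted immediately before the start block's terminator.
  Builder.SetInsertPoint(Start->getTerminator());
  Value *Buffer = Builder.CreateMalloc(IntPtrTy, ElemTy, AllocSize,
                                       /*ArraySize=*/nullptr);
  // The matching free goes in front of the exit block's terminator.
  Builder.SetInsertPoint(Exit->getTerminator());
  Builder.CreateFree(Buffer);
  return Buffer;
}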
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 5d1574162aa69..7ae9b6173ec72 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -8290,6 +8290,7 @@ cc_library(
         ":LinalgTransformOps",
         ":LinalgTransforms",
         ":MLProgramDialect",
+        ":MLProgramTransforms",
         ":MathDialect",
         ":MathToFuncs",
         ":MathToLLVM",
@@ -9096,6 +9097,20 @@ td_library(
 gentbl_cc_library(
     name = "OpenMPOpsIncGen",
     tbl_outs = [
+        (
+            [
+                "-gen-typedef-decls",
+                "-typedefs-dialect=omp",
+            ],
+            "include/mlir/Dialect/OpenMP/OpenMPOpsTypes.h.inc",
+        ),
+        (
+            [
+                "-gen-typedef-defs",
+                "-typedefs-dialect=omp",
+            ],
+            "include/mlir/Dialect/OpenMP/OpenMPOpsTypes.cpp.inc",
+        ),
         (
             ["-gen-op-decls"],
             "include/mlir/Dialect/OpenMP/OpenMPOps.h.inc",
@@ -11871,6 +11886,22 @@ gentbl_cc_library(
     deps = [":MLProgramOpsTdFiles"],
 )
 
+gentbl_cc_library(
+    name = "MLProgramPassIncGen",
+    tbl_outs = [
+        (
+            [
+                "-gen-pass-decls",
+                "-name=MLProgram",
+            ],
+            "include/mlir/Dialect/MLProgram/Transforms/Passes.h.inc",
+        ),
+    ],
+    tblgen = ":mlir-tblgen",
+    td_file = "include/mlir/Dialect/MLProgram/Transforms/Passes.td",
+    deps = [":PassBaseTdFiles"],
+)
+
 gentbl_cc_library(
     name = "MLProgramTypesIncGen",
     tbl_outs = [
@@ -11890,13 +11921,8 @@ gentbl_cc_library(
 
 cc_library(
     name = "MLProgramDialect",
-    srcs = glob([
-        "lib/Dialect/MLProgram/IR/*.cpp",
-        "lib/Dialect/MLProgram/IR/*.h",
-    ]),
-    hdrs = glob([
-        "include/mlir/Dialect/MLProgram/IR/*.h",
-    ]),
+    srcs = glob(["lib/Dialect/MLProgram/IR/*.cpp"]),
+    hdrs = glob(["include/mlir/Dialect/MLProgram/IR/*.h"]),
     includes = ["include"],
     deps = [
         ":ControlFlowInterfaces",
@@ -11911,6 +11937,25 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "MLProgramTransforms",
+    srcs = glob([
+        "lib/Dialect/MLProgram/Transforms/*.cpp",
+    ]),
+    hdrs = glob([
+        "include/mlir/Dialect/MLProgram/Transforms/*.h",
+    ]),
+    includes = ["include"],
+    deps = [
+        ":MLProgramDialect",
+        ":FuncDialect",
+        ":TransformUtils",
+        ":IR",
+        ":MLProgramPassIncGen",
+        ":Pass",
+    ],
+)
+
 ##---------------------------------------------------------------------------##
 # Allocation interfaces
 ##---------------------------------------------------------------------------##